Unverified Commit d044c656 authored by peastman's avatar peastman Committed by GitHub
Browse files

Merge pull request #2275 from peastman/molnet

Converted more molnet loaders to new API
parents 5dbf3b0a 2dbbeef4
Loading
Loading
Loading
Loading
+102 −159
Original line number Diff line number Diff line
@@ -4,108 +4,98 @@ BBBC Dataset loader.
This file contains image loaders for the BBBC dataset collection (https://data.broadinstitute.org/bbbc/image_sets.html).
"""
import os
import numpy as np
import logging
import deepchem
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
BBBC1_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
BBBC1_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt'
BBBC1_TASKS = ["cell-count"]

BBBC2_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip'
BBBC2_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt'
BBBC2_TASKS = ["cell-count"]


class _BBBC001Loader(_MolnetLoader):

def load_bbbc001(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "BBBC001_v1_images_tif.zip")
    labels_file = os.path.join(self.data_dir, "BBBC001_v1_counts.txt")
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(
          url=BBBC1_IMAGE_URL, dest_dir=self.data_dir)
    if not os.path.exists(labels_file):
      dc.utils.data_utils.download_url(
          url=BBBC1_LABEL_URL, dest_dir=self.data_dir)
    loader = dc.data.ImageLoader()
    return loader.create_dataset(dataset_file, in_memory=False)


def load_bbbc001(
    splitter: Union[dc.splits.Splitter, str, None] = 'index',
    transformers: List[Union[TransformerGenerator, str]] = [],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load BBBC001 dataset

  This dataset contains 6 images of human HT29 colon cancer cells. The task is
  to learn to predict the cell counts in these images. This dataset is too small
  to serve to train algorithms, but might serve as a good test dataset.
  https://data.broadinstitute.org/bbbc/BBBC001/

  Parameters
  ----------
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in
  """
  # Featurize BBBC001 dataset
  bbbc001_tasks = ["cell-count"]

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "bbbc001-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbc001_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC001_v1_images_tif.zip")
  labels_file = os.path.join(data_dir, "BBBC001_v1_counts.txt")
  featurizer = dc.feat.UserDefinedFeaturizer([])  # Not actually used
  loader = _BBBC001Loader(featurizer, splitter, transformers, BBBC1_TASKS,
                          data_dir, save_dir, **kwargs)
  return loader.load_dataset('bbbc001', reload)


class _BBBC002Loader(_MolnetLoader):

  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "BBBC002_v1_images.zip")
    labels_file = os.path.join(self.data_dir, "BBBC002_v1_counts.txt.txt")
    if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(
        url=BBBC1_IMAGE_URL, dest_dir=data_dir)
      dc.utils.data_utils.download_url(
          url=BBBC2_IMAGE_URL, dest_dir=self.data_dir)
    if not os.path.exists(labels_file):
    deepchem.utils.data_utils.download_url(
        url=BBBC1_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Strip the first line which holds field labels
  lines = [x.strip() for x in content][1:]
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
  y = np.array(counts)

  # This is kludgy way to add y to dataset. Can be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y)

  if split == None:
    transformers = []
    logger.info("Split is None, no transformers used for the dataset.")
    return bbbc001_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return bbbc001_tasks, all_dataset, transformers


def load_bbbc002(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
      dc.utils.data_utils.download_url(
          url=BBBC2_LABEL_URL, dest_dir=self.data_dir)
    loader = dc.data.ImageLoader()
    return loader.create_dataset(dataset_file, in_memory=False)


def load_bbbc002(
    splitter: Union[dc.splits.Splitter, str, None] = 'index',
    transformers: List[Union[TransformerGenerator, str]] = [],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load BBBC002 dataset

  This dataset contains data corresponding to 5 samples of Drosophilia Kc167
@@ -113,74 +103,27 @@ def load_bbbc002(split='index',
  512x512. Ground truth labels contain cell counts for this dataset. Full
  details about this dataset are present at
  https://data.broadinstitute.org/bbbc/BBBC002/.
  """
  # Featurize BBBC002 dataset
  bbbc002_tasks = ["cell-count"]

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "bbbc002-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbc002_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC002_v1_images.zip")
  labels_file = os.path.join(data_dir, "BBBC002_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(
        url=BBBC2_IMAGE_URL, dest_dir=data_dir)
  if not os.path.exists(labels_file):
    deepchem.utils.data_utils.download_url(
        url=BBBC2_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Strip the first line which holds field labels
  lines = [x.strip() for x in content][1:]
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
  y = np.reshape(np.array(counts), (len(counts), 1))
  ids = [x[0] for x in lines]

  # This is kludgy way to add y to dataset. Can be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y, ids=ids)

  if split == None:
    transformers = []
    logger.info("Split is None, no transformers used for the dataset.")
    return bbbc002_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  all_dataset = (train, valid, test)
  transformers = []
  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return bbbc002_tasks, all_dataset, transformers
  Parameters
  ----------
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in
  """
  featurizer = dc.feat.UserDefinedFeaturizer([])  # Not actually used
  loader = _BBBC002Loader(featurizer, splitter, transformers, BBBC2_TASKS,
                          data_dir, save_dir, **kwargs)
  return loader.load_dataset('bbbc002', reload)
+50 −94
Original line number Diff line number Diff line
@@ -2,21 +2,35 @@
Blood-Brain Barrier Penetration dataset loader.
"""
import os
import logging
import deepchem
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
BBBP_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv"
BBBP_TASKS = ["p_np"]


class _BBBPLoader(_MolnetLoader):

def load_bbbp(featurizer='ECFP',
              split='random',
              reload=True,
              data_dir=None,
              save_dir=None,
              **kwargs):
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "BBBP.csv")
    if not os.path.exists(dataset_file):
      dc.utils.data_utils.download_url(url=BBBP_URL, dest_dir=self.data_dir)
    loader = dc.data.CSVLoader(
        tasks=self.tasks, feature_field="smiles", featurizer=self.featurizer)
    return loader.create_dataset(dataset_file, shard_size=8192)


def load_bbbp(
    featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
    splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
    transformers: List[Union[TransformerGenerator, str]] = ['balancing'],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load BBBP dataset

  The blood-brain barrier penetration (BBBP) dataset is designed for the
@@ -37,92 +51,34 @@ def load_bbbp(featurizer='ECFP',
  - "smiles" - SMILES representation of the molecular structure
  - "p_np" - Binary labels for penetration/non-penetration

  Parameters
  ----------
  featurizer: Featurizer or str
    the featurizer to use for processing the data.  Alternatively you can pass
    one of the names from dc.molnet.featurizers as a shortcut.
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in

  References
  ----------
  .. [1] Martins, Ines Filipa, et al. "A Bayesian approach to in silico
     blood-brain barrier penetration modeling." Journal of chemical
     information and modeling 52.6 (2012): 1686-1697.
  """

  # Featurize bbb dataset
  logger.info("About to featurize bbbp dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bbbp_tasks = ["p_np"]

  if reload:
    save_folder = os.path.join(save_dir, "bbbp-featurized", featurizer)
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbp_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(url=BBBP_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bbbp_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  if split is None:
    # Initialize transformers
    transformers = [deepchem.trans.BalancingTransformer(dataset=dataset)]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bbbp_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  # Initialize transformers
  transformers = [deepchem.trans.BalancingTransformer(dataset=train)]

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return bbbp_tasks, (train, valid, test), transformers
  loader = _BBBPLoader(featurizer, splitter, transformers, BBBP_TASKS, data_dir,
                       save_dir, **kwargs)
  return loader.load_dataset('bbbp', reload)
+48 −61
Original line number Diff line number Diff line
@@ -6,72 +6,59 @@ http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html. Labels aren't
available for this dataset, so only raw images are provided.
"""
import os
import logging
import deepchem
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

logger = logging.getLogger(__name__)
CELL_COUNTING_URL = 'http://www.robots.ox.ac.uk/~vgg/research/counting/cells.zip'
CELL_COUNTING_TASKS: List[str] = []

DEFAULT_DIR = deepchem.utils.data_utils.get_data_dir()
DATASET_URL = 'http://www.robots.ox.ac.uk/~vgg/research/counting/cells.zip'

class _CellCountingLoader(_MolnetLoader):

def load_cell_counting(split=None,
                       reload=True,
                       data_dir=None,
                       save_dir=None,
                       **kwargs):
  """Load Cell Counting dataset.

  Loads the cell counting dataset from http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR
  # No tasks since no labels provided.
  cell_counting_tasks = []
  # For now images are loaded directly by ImageLoader
  featurizer = ""
  if reload:
    save_folder = os.path.join(save_dir, "cell_counting-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.data_utils.load_dataset_from_disk(
        save_folder)
    if loaded:
      return cell_counting_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "cells.zip")
  def create_dataset(self) -> Dataset:
    dataset_file = os.path.join(self.data_dir, "cells.zip")
    if not os.path.exists(dataset_file):
    deepchem.utils.data_utils.download_url(url=DATASET_URL, dest_dir=data_dir)

  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file)
      dc.utils.data_utils.download_url(
          url=CELL_COUNTING_URL, dest_dir=self.data_dir)
    loader = dc.data.ImageLoader()
    return loader.featurize(dataset_file)

  transformers = []

  if split == None:
    logger.info("Split is None, no transformers used.")
    return cell_counting_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]
def load_cell_counting(
    splitter: Union[dc.splits.Splitter, str, None] = None,
    transformers: List[Union[TransformerGenerator, str]] = [],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
  """Load Cell Counting dataset.

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)
  Loads the cell counting dataset from http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.data_utils.save_dataset_to_disk(save_folder, train, valid,
                                                   test, transformers)
  return cell_counting_tasks, all_dataset, transformers
  Parameters
  ----------
  splitter: Splitter or str
    the splitter to use for splitting the data into training, validation, and
    test sets.  Alternatively you can pass one of the names from
    dc.molnet.splitters as a shortcut.  If this is None, all the data
    will be included in a single dataset.
  transformers: list of TransformerGenerators or strings
    the Transformers to apply to the data.  Each one is specified by a
    TransformerGenerator or, as a shortcut, one of the names from
    dc.molnet.transformers.
  reload: bool
    if True, the first call for a particular featurizer and splitter will cache
    the datasets to disk, and subsequent calls will reload the cached datasets.
  data_dir: str
    a directory to save the raw data in
  save_dir: str
    a directory to save the dataset in
  """
  featurizer = dc.feat.UserDefinedFeaturizer([])  # Not actually used
  loader = _CellCountingLoader(featurizer, splitter, transformers,
                               CELL_COUNTING_TASKS, data_dir, save_dir,
                               **kwargs)
  return loader.load_dataset('cell_counting', reload)
+56 −110

File changed.

Preview size limit exceeded, changes collapsed.

+51 −97

File changed.

Preview size limit exceeded, changes collapsed.

Loading