Commit 0b116b4b authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Partial move

parent 9277d100
Loading
Loading
Loading
Loading
+0 −0

Empty file deleted.

+0 −220
Original line number Diff line number Diff line
"""
bace dataset loader.
"""
import os
import logging
import deepchem
from deepchem.molnet.load_function.bace_features import bace_user_specified_features

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BACE_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'


def load_bace_regression(featurizer='ECFP',
                         split='random',
                         reload=True,
                         move_mean=True,
                         data_dir=None,
                         save_dir=None,
                         **kwargs):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["pIC50"]

  if reload:
    save_folder = os.path.join(save_dir, "bace_r-featurized")
    if not move_mean:
      save_folder = os.path.join(save_folder, str(featurizer) + "_mean_unmoved")
    else:
      save_folder = os.path.join(save_folder, str(featurizer))

    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)

  dataset = loader.featurize(dataset_file, shard_size=8192)
  if split is None:
    # Initialize transformers
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset, move_mean=move_mean)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bace_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'stratified': deepchem.splits.SingletaskStratifiedSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data using {} splitter".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train, move_mean=move_mean)
  ]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers


def load_bace_classification(featurizer='ECFP',
                             split='random',
                             reload=True,
                             data_dir=None,
                             save_dir=None,
                             **kwargs):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["Class"]

  if reload:
    save_folder = os.path.join(save_dir, "bace_c-featurized", str(featurizer))
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)

  dataset = loader.featurize(dataset_file, shard_size=8192)

  if split is None:
    # Initialize transformers
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bace_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'stratified': deepchem.splits.RandomStratifiedSplitter()
  }

  splitter = splitters[split]
  logger.info("About to split data using {} splitter".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
  ]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers
+0 −182
Original line number Diff line number Diff line
"""
BBBC Dataset loader.

This file contains image loaders for the BBBC dataset collection (https://data.broadinstitute.org/bbbc/image_sets.html).
"""
import os
import numpy as np
import logging
import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BBBC1_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
BBBC1_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt'

BBBC2_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip'
BBBC2_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt'


def load_bbbc001(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load BBBC001 dataset

  This dataset contains 6 images of human HT29 colon cancer cells. The task is
  to learn to predict the cell counts in these images. This dataset is too small
   to serve to train algorithms, but might serve as a good test dataset.
   https://data.broadinstitute.org/bbbc/BBBC001/
  """
  # Featurize BBBC001 dataset
  bbbc001_tasks = ["cell-count"]

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "bbbc001-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbc001_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC001_v1_images_tif.zip")
  labels_file = os.path.join(data_dir, "BBBC001_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BBBC1_IMAGE_URL, dest_dir=data_dir)
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(url=BBBC1_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Strip the first line which holds field labels
  lines = [x.strip() for x in content][1:]
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
  y = np.array(counts)

  # This is kludgy way to add y to dataset. Can be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y)

  if split == None:
    transformers = []
    logger.info("Split is None, no transformers used for the dataset.")
    return bbbc001_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbc001_tasks, all_dataset, transformers


def load_bbbc002(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load BBBC002 dataset

  This dataset contains data corresponding to 5 samples of Drosophilia Kc167
  cells. There are 10 fields of view for each sample, each an image of size
  512x512. Ground truth labels contain cell counts for this dataset. Full
  details about this dataset are present at
  https://data.broadinstitute.org/bbbc/BBBC002/.
  """
  # Featurize BBBC002 dataset
  bbbc002_tasks = ["cell-count"]

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "bbbc002-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbc002_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC002_v1_images.zip")
  labels_file = os.path.join(data_dir, "BBBC002_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BBBC2_IMAGE_URL, dest_dir=data_dir)
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(url=BBBC2_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Strip the first line which holds field labels
  lines = [x.strip() for x in content][1:]
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
  y = np.reshape(np.array(counts), (len(counts), 1))
  ids = [x[0] for x in lines]

  # This is kludgy way to add y to dataset. Can be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y, ids=ids)

  if split == None:
    transformers = []
    logger.info("Split is None, no transformers used for the dataset.")
    return bbbc002_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  all_dataset = (train, valid, test)
  transformers = []
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbc002_tasks, all_dataset, transformers
+0 −106
Original line number Diff line number Diff line
"""
Blood-Brain Barrier Penetration dataset loader.
"""
import os
import logging
import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BBBP_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'


def load_bbbp(featurizer='ECFP',
              split='random',
              reload=True,
              data_dir=None,
              save_dir=None,
              **kwargs):
  """Load blood-brain barrier penetration datasets """
  # Featurize bbb dataset
  logger.info("About to featurize bbbp dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bbbp_tasks = ["p_np"]

  if reload:
    save_folder = os.path.join(save_dir, "bbbp-featurized", featurizer)
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return bbbp_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BBBP_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bbbp_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  if split is None:
    # Initialize transformers
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return bbbp_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)

  # Initialize transformers
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
  ]

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbp_tasks, (train, valid, test), transformers
+0 −77
Original line number Diff line number Diff line
"""
Cell Counting Dataset.

Loads the cell counting dataset from
http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html. Labels aren't
available for this dataset, so only raw images are provided.
"""
import os
import logging
import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
DATASET_URL = 'http://www.robots.ox.ac.uk/~vgg/research/counting/cells.zip'


def load_cell_counting(split=None,
                       reload=True,
                       data_dir=None,
                       save_dir=None,
                       **kwargs):
  """Load Cell Counting dataset.

  Loads the cell counting dataset from http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR
  # No tasks since no labels provided.
  cell_counting_tasks = []
  # For now images are loaded directly by ImageLoader
  featurizer = ""
  if reload:
    save_folder = os.path.join(save_dir, "cell_counting-featurized", str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return cell_counting_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "cells.zip")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=DATASET_URL, dest_dir=data_dir)

  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file)

  transformers = []

  if split == None:
    logger.info("Split is None, no transformers used.")
    return cell_counting_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return cell_counting_tasks, all_dataset, transformers
Loading