Unverified Commit 8d94661c authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #1649 from VIGS25/smiles-to-image-molnet

Custom directories and SmilesToImage for MolNet
parents b631acb3 5256cbc5
Loading
Loading
Loading
Loading
+61 −29
Original line number Diff line number Diff line
@@ -11,36 +11,48 @@ from deepchem.molnet.load_function.bace_features import bace_user_specified_feat

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BACE_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'


def load_bace_regression(featurizer='ECFP',
                         split='random',
                         reload=True,
                         move_mean=True):
                         move_mean=True,
                         data_dir=None,
                         save_dir=None,
                         **kwargs):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["pIC50"]

  if reload:
    if move_mean:
      dir_name = "bace_r/" + featurizer + "/" + str(split)
    save_folder = os.path.join(save_dir, "bace_r-featurized")
    if not move_mean:
      save_folder = os.path.join(save_folder, str(featurizer) + "_mean_unmoved")
    else:
      dir_name = "bace_r/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  dataset_file = os.path.join(data_dir, "bace.csv")
      save_folder = os.path.join(save_folder, str(featurizer))

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    )
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

  bace_tasks = ["pIC50"]
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -52,6 +64,11 @@ def load_bace_regression(featurizer='ECFP',
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)
@@ -91,33 +108,43 @@ def load_bace_regression(featurizer='ECFP',
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers


def load_bace_classification(featurizer='ECFP', split='random', reload=True):
def load_bace_classification(featurizer='ECFP',
                             split='random',
                             reload=True,
                             data_dir=None,
                             save_dir=None,
                             **kwargs):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "bace.csv")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    )
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bace_tasks = ["Class"]

  if reload:
    save_folder = os.path.join(save_dir, "bace_c-featurized", str(featurizer))
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return bace_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BACE_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -129,6 +156,11 @@ def load_bace_classification(featurizer='ECFP', split='random', reload=True):
  elif featurizer == 'UserDefined':
    featurizer = deepchem.feat.UserDefinedFeaturizer(
        bace_user_specified_features)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)
@@ -168,6 +200,6 @@ def load_bace_classification(featurizer='ECFP', split='random', reload=True):
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bace_tasks, (train, valid, test), transformers
+3 −2
Original line number Diff line number Diff line
@@ -196,8 +196,9 @@ bace_user_specified_features = [
    'path/walk 4 - Randic shape index (RSIpw4)',
    'path/walk 5 - Randic shape index (RSIpw5)',
    'E-state topological parameter (ETP)', 'Ring Count 3 (RNGCNT3)',
    'Ring Count 4 (RNGCNT4)', 'Ring Count 5 (RNGCNT5)', 'Ring Count 6 (RNGCNT6)',
    'Ring Count 7 (RNGCNT7)', 'Ring Count 8 (RNGCNT8)', 'Ring Count 9 (RNGCNT9)',
    'Ring Count 4 (RNGCNT4)', 'Ring Count 5 (RNGCNT5)',
    'Ring Count 6 (RNGCNT6)', 'Ring Count 7 (RNGCNT7)',
    'Ring Count 8 (RNGCNT8)', 'Ring Count 9 (RNGCNT9)',
    'Ring Count 10 (RNGCNT10)', 'Ring Count 11 (RNGCNT11)',
    'Ring Count 12 (RNGCNT12)', 'Ring Count 13 (RNGCNT13)',
    'Ring Count 14 (RNGCNT14)', 'Ring Count 15 (RNGCNT15)',
+43 −20
Original line number Diff line number Diff line
@@ -14,31 +14,47 @@ import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BBBC1_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
BBBC1_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt'

def load_bbbc001(split='index', reload=True):
BBBC2_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip'
BBBC2_LABEL_URL = 'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt'


def load_bbbc001(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load BBBC001 dataset

  This dataset contains 6 images of human HT29 colon cancer cells. The task is to learn to predict the cell counts in these images. This dataset is too small to serve to train algorithms, but might serve as a good test dataset. https://data.broadinstitute.org/bbbc/BBBC001/
  This dataset contains 6 images of human HT29 colon cancer cells. The task is
  to learn to predict the cell counts in these images. This dataset is too small
  to train algorithms on, but might serve as a good test dataset.
  https://data.broadinstitute.org/bbbc/BBBC001/
  """
  # Featurize BBBC001 dataset
  bbbc001_tasks = ["cell-count"]
  data_dir = deepchem.utils.get_data_dir()

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_dir = os.path.join(data_dir, "bbbc001/" + str(split))
    save_folder = os.path.join(save_dir, "bbbc001-featurized/" + str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return bbbc001_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC001_v1_images_tif.zip")
  labels_file = os.path.join(data_dir, "BBBC001_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_images_tif.zip'
    )
    deepchem.utils.download_url(url=BBBC1_IMAGE_URL, dest_dir=data_dir)
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC001/BBBC001_v1_counts.txt')
    deepchem.utils.download_url(url=BBBC1_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)
@@ -74,12 +90,16 @@ def load_bbbc001(split='index', reload=True):
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbc001_tasks, all_dataset, transformers


def load_bbbc002(split='index', reload=True):
def load_bbbc002(split='index',
                 reload=True,
                 data_dir=None,
                 save_dir=None,
                 **kwargs):
  """Load BBBC002 dataset

  This dataset contains data corresponding to 5 samples of Drosophila Kc167
@@ -90,22 +110,25 @@ def load_bbbc002(split='index', reload=True):
  """
  # Featurize BBBC002 dataset
  bbbc002_tasks = ["cell-count"]
  data_dir = deepchem.utils.get_data_dir()

  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_dir = os.path.join(data_dir, "bbbc002/" + str(split))
    save_folder = os.path.join(save_dir, "bbbc002-featurized/" + str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return bbbc002_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC002_v1_images.zip")
  labels_file = os.path.join(data_dir, "BBBC002_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip')
    deepchem.utils.download_url(url=BBBC2_IMAGE_URL, dest_dir=data_dir)
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt')
    deepchem.utils.download_url(url=BBBC2_LABEL_URL, dest_dir=data_dir)
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)
@@ -142,6 +165,6 @@ def load_bbbc002(split='index', reload=True):
  all_dataset = (train, valid, test)
  transformers = []
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbc002_tasks, all_dataset, transformers
+30 −12
Original line number Diff line number Diff line
@@ -10,29 +10,42 @@ import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
BBBP_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'

def load_bbbp(featurizer='ECFP', split='random', reload=True):

def load_bbbp(featurizer='ECFP',
              split='random',
              reload=True,
              data_dir=None,
              save_dir=None,
              **kwargs):
  """Load blood-brain barrier penetration datasets """
  # Featurize bbb dataset
  logger.info("About to featurize bbbp dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'
    )
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  bbbp_tasks = ["p_np"]

  if reload:
    save_folder = os.path.join(save_dir, "bbbp-featurized", featurizer)
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return bbbp_tasks, all_dataset, transformers

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=BBBP_URL, dest_dir=data_dir)

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
@@ -41,6 +54,11 @@ def load_bbbp(featurizer='ECFP', split='random', reload=True):
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)

  loader = deepchem.data.CSVLoader(
      tasks=bbbp_tasks, smiles_field="smiles", featurizer=featurizer)
@@ -78,6 +96,6 @@ def load_bbbp(featurizer='ECFP', split='random', reload=True):
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return bbbp_tasks, (train, valid, test), transformers
+17 −8
Original line number Diff line number Diff line
@@ -14,28 +14,37 @@ import deepchem

logger = logging.getLogger(__name__)

DEFAULT_DIR = deepchem.utils.get_data_dir()
DATASET_URL = 'http://www.robots.ox.ac.uk/~vgg/research/counting/cells.zip'

def load_cell_counting(split=None, reload=True):

def load_cell_counting(split=None,
                       reload=True,
                       data_dir=None,
                       save_dir=None,
                       **kwargs):
  """Load Cell Counting dataset.

  Loads the cell counting dataset from http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.
  """
  data_dir = deepchem.utils.get_data_dir()
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR
  # No tasks since no labels provided.
  cell_counting_tasks = []
  # For now images are loaded directly by ImageLoader
  featurizer = ""
  if reload:
    save_dir = os.path.join(data_dir,
                            "cell_counting/" + featurizer + "/" + str(split))
    save_folder = os.path.join(save_dir,
                               "cell_counting-featurized/" + str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
        save_folder)
    if loaded:
      return cell_counting_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "cells.zip")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://www.robots.ox.ac.uk/~vgg/research/counting/cells.zip')
    deepchem.utils.download_url(url=DATASET_URL, dest_dir=data_dir)

  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file)
@@ -59,6 +68,6 @@ def load_cell_counting(split=None, reload=True):
  transformers = []
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                             transformers)
  return cell_counting_tasks, all_dataset, transformers
Loading