Unverified Commit 1124ab6e authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1416 from VIGS25/code_changes

#630 - Add MolNet loaders for UV/Kinase/Factors datasets
parents f9510bee f7afbc25
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -28,6 +28,9 @@ from deepchem.molnet.load_function.sweetlead_datasets import load_sweet
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast
from deepchem.molnet.load_function.uspto_datasets import load_uspto
from deepchem.molnet.load_function.uv_datasets import load_uv
from deepchem.molnet.load_function.factors_datasets import load_factors
from deepchem.molnet.load_function.kinase_datasets import load_kinase

from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting
+176 −0
Original line number Diff line number Diff line
"""
FACTOR dataset loader
"""

from __future__ import division
from __future__ import unicode_literals

import os
import logging
import time

import numpy as np
import deepchem
from deepchem.molnet.load_function.kaggle_features import merck_descriptors

logger = logging.getLogger(__name__)

# File names of the three raw FACTORS splits.
TRAIN_FILENAME = "FACTORS_training_disguised_combined_full.csv.gz"
VALID_FILENAME = "FACTORS_test1_disguised_combined_full.csv.gz"
TEST_FILENAME = "FACTORS_test2_disguised_combined_full.csv.gz"

# Each download URL is the shared datasets folder followed by the file name.
_DATASETS_URL = 'https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/'
TRAIN_URL = _DATASETS_URL + TRAIN_FILENAME
VALID_URL = _DATASETS_URL + VALID_FILENAME
TEST_URL = _DATASETS_URL + TEST_FILENAME


def remove_missing_entries(dataset):
  """Drop all-zero feature rows from every shard of ``dataset``.

  Missing entries appear in some of these datasets as feature vectors that
  are zero everywhere. Each shard yielded by ``dataset.itershards()`` is
  filtered down to the rows with at least one nonzero feature and written
  back under the same shard index with ``dataset.set_shard``.
  """
  for shard_num, shard in enumerate(dataset.itershards()):
    X, y, w, ids = shard
    # True for rows that carry at least one nonzero feature value.
    keep = X.any(axis=1)
    num_missing = np.count_nonzero(~keep)
    logger.info("Shard %d has %d missing entries.", shard_num, num_missing)
    dataset.set_shard(shard_num, X[keep], y[keep], w[keep], ids[keep])


def get_transformers(train_dataset):
  """Return the transformers to apply to the dataset.

  No transformers are configured at present, so the result is always a
  fresh empty list; ``train_dataset`` is accepted but not inspected.
  """
  # TODO: Check if anything needs to be added
  return []


def gen_factors(FACTORS_tasks,
                data_dir,
                train_dir,
                valid_dir,
                test_dir,
                shard_size=2000):
  """Build the FACTORS train/validation/test datasets from the raw files.

  The three pre-split csv.gz files are looked up in ``data_dir`` and any
  file that is missing is downloaded there first. Each file is featurized,
  all-zero feature rows are dropped, the training set is shuffled, the
  transformers returned by ``get_transformers`` are applied, and the
  datasets are moved into their target directories. Does not do a
  train/test split.

  Parameters
  ----------
  FACTORS_tasks: list
    Task names, handed to the CSV loader as ``tasks``.
  data_dir: str
    Directory that holds (or receives) the raw dataset files.
  train_dir, valid_dir, test_dir: str
    Directories the featurized datasets are moved to.
  shard_size: int, optional
    Shard size passed to ``loader.featurize``.

  Returns
  -------
  tuple
    ``(train_dataset, valid_dataset, test_dataset)``.
  """

  time1 = time.time()

  train_files = os.path.join(data_dir, TRAIN_FILENAME)
  valid_files = os.path.join(data_dir, VALID_FILENAME)
  test_files = os.path.join(data_dir, TEST_FILENAME)

  # Check every raw file on its own. Guarding all three downloads on the
  # train file alone would leave a missing validation/test file (e.g. after
  # an interrupted download) unfetched on every later run.
  downloads = [
      (train_files, TRAIN_URL, "Downloading train file...",
       "Training file download complete."),
      (valid_files, VALID_URL, "Downloading validation file...",
       "Validation file download complete."),
      (test_files, TEST_URL, "Downloading test file...",
       "Test file download complete"),
  ]
  for raw_path, url, start_msg, done_msg in downloads:
    if not os.path.exists(raw_path):
      logger.info(start_msg)
      deepchem.utils.download_url(url=url, dest_dir=data_dir)
      logger.info(done_msg)

  # Featurize the FACTORS dataset
  logger.info("About to featurize the FACTORS dataset")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
  loader = deepchem.data.UserCSVLoader(
      tasks=FACTORS_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing the train dataset...")
  train_dataset = loader.featurize(train_files, shard_size=shard_size)

  logger.info("Featurizing the validation dataset...")
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

  logger.info("Featurizing the test dataset...")
  test_dataset = loader.featurize(test_files, shard_size=shard_size)

  logger.info("Remove missing entries from dataset")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  # Shuffle the training data
  logger.info("Shuffling the training dataset")
  train_dataset.sparse_shuffle()

  # Apply transformations
  logger.info("Transforming datasets with transformers")
  transformers = get_transformers(train_dataset)

  for transformer in transformers:
    logger.info("Performing transformations with {}".format(
        transformer.__class__.__name__))

    logger.info("Transforming the training dataset...")
    train_dataset = transformer.transform(train_dataset)

    logger.info("Transforming the validation dataset...")
    valid_dataset = transformer.transform(valid_dataset)

    logger.info("Transforming the test dataset...")
    test_dataset = transformer.transform(test_dataset)

  logger.info("Transformations complete.")
  logger.info("Moving datasets to corresponding directories")

  train_dataset.move(train_dir)
  logger.info("Train dataset moved.")

  valid_dataset.move(valid_dir)
  logger.info("Validation dataset moved.")

  test_dataset.move(test_dir)
  logger.info("Test dataset moved.")

  time2 = time.time()

  ########## TIMING ################
  logger.info("TIMING: FACTORS fitting took %0.3f s" % (time2 - time1))

  return train_dataset, valid_dataset, test_dataset


def load_factors(shard_size=2000, featurizer=None, split=None, reload=True):
  """Load the FACTORS dataset; does not do train/test split.

  The datasets live under ``<deepchem data dir>/factors``. When the
  ``train_dir``, ``valid_dir`` and ``test_dir`` sub-directories all exist
  they are opened as ``DiskDataset`` objects; otherwise the datasets are
  generated with ``gen_factors``.

  Parameters
  ----------
  shard_size: int, optional
    Shard size forwarded to ``gen_factors`` when featurizing.
  featurizer: optional
    Currently not used by this function.
  split: optional
    Currently not used by this function.
  reload: bool, optional
    Currently not used by this function.

  Returns
  -------
  tuple
    ``(tasks, (train_dataset, valid_dataset, test_dataset), transformers)``.
  """

  # Tasks are named T_00001 ... T_00012.
  FACTORS_tasks = ['T_%05d' % task_num for task_num in range(1, 13)]

  data_dir = os.path.join(deepchem.utils.get_data_dir(), "factors")

  # makedirs also creates missing parent directories; a plain mkdir raises
  # when the DeepChem data directory itself does not exist yet.
  if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  # NOTE(review): cached datasets are reused whenever all three directories
  # exist, whatever the value of `reload` -- confirm whether reload=False
  # is meant to force re-featurization.
  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):

    logger.info("Reloading existing datasets")
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)

  else:
    logger.info("Featurizing datasets")
    train_dataset, valid_dataset, test_dataset = \
    gen_factors(FACTORS_tasks=FACTORS_tasks, data_dir=data_dir, train_dir=train_dir,
                valid_dir=valid_dir, test_dir=test_dir, shard_size=shard_size)

  transformers = get_transformers(train_dataset)

  return FACTORS_tasks, (train_dataset, valid_dataset,
                         test_dataset), transformers
+199 −0
Original line number Diff line number Diff line
"""
KINASE dataset loader
"""

from __future__ import division
from __future__ import unicode_literals

import os
import logging
import time

import numpy as np
import deepchem
from deepchem.molnet.load_function.kaggle_features import merck_descriptors

# File names of the three raw KINASE splits.
TRAIN_FILENAME = "KINASE_training_disguised_combined_full.csv.gz"
VALID_FILENAME = "KINASE_test1_disguised_combined_full.csv.gz"
TEST_FILENAME = "KINASE_test2_disguised_combined_full.csv.gz"

# Each download URL is the shared datasets folder followed by the file name.
_DATASETS_URL = 'https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/'
TRAIN_URL = _DATASETS_URL + TRAIN_FILENAME
VALID_URL = _DATASETS_URL + VALID_FILENAME
TEST_URL = _DATASETS_URL + TEST_FILENAME

logger = logging.getLogger(__name__)


def remove_missing_entries(dataset):
  """Filter out rows whose features are all zero, one shard at a time.

  Some of the datasets contain missing entries that come through as
  zeroed-out feature vectors. For each shard from ``dataset.itershards()``
  the rows with any nonzero feature are kept and the reduced shard is
  stored back with ``dataset.set_shard``.
  """
  for index, (features, labels, weights, identifiers) in enumerate(
      dataset.itershards()):
    present = features.any(axis=1)
    # Rows not flagged as present are the zeroed-out (missing) ones.
    logger.info("Shard %d has %d missing entries.", index,
                np.count_nonzero(~present))
    dataset.set_shard(index, features[present], labels[present],
                      weights[present], identifiers[present])


def get_transformers(train_dataset):
  """Return the transformers to apply to the dataset.

  The list is currently always empty and newly created on each call;
  ``train_dataset`` is accepted but not inspected.
  """
  # TODO: Check for this
  return []


def gen_kinase(KINASE_tasks,
               train_dir,
               valid_dir,
               test_dir,
               data_dir,
               shard_size=2000):
  """Build the KINASE train/validation/test datasets from the raw files.

  The three pre-split csv.gz files are looked up in ``data_dir`` and any
  file that is missing is downloaded there first. Each file is featurized,
  all-zero feature rows are dropped, the training set is shuffled, the
  transformers returned by ``get_transformers`` are applied, and the
  datasets are moved into their target directories.

  Parameters
  ----------
  KINASE_tasks: list
    Task names, handed to the CSV loader as ``tasks``.
  train_dir, valid_dir, test_dir: str
    Directories the featurized datasets are moved to.
  data_dir: str
    Directory that holds (or receives) the raw dataset files.
  shard_size: int, optional
    Shard size passed to ``loader.featurize``.

  Returns
  -------
  tuple
    ``(train_dataset, valid_dataset, test_dataset)``.
  """

  time1 = time.time()

  train_files = os.path.join(data_dir, TRAIN_FILENAME)
  valid_files = os.path.join(data_dir, VALID_FILENAME)
  test_files = os.path.join(data_dir, TEST_FILENAME)

  # Download files if they don't exist. Every file is checked on its own:
  # guarding all three downloads on the train file alone would leave a
  # missing validation/test file unfetched on every later run.
  downloads = [
      (train_files, TRAIN_URL, "Downloading training file...",
       "Training file download complete."),
      (valid_files, VALID_URL, "Downloading validation file...",
       "Validation file download complete."),
      (test_files, TEST_URL, "Downloading test file...",
       "Test file download complete"),
  ]
  for raw_path, url, start_msg, done_msg in downloads:
    if not os.path.exists(raw_path):
      logger.info(start_msg)
      deepchem.utils.download_url(url=url, dest_dir=data_dir)
      logger.info(done_msg)

  # Featurize the KINASE dataset
  logger.info("About to featurize KINASE dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)

  loader = deepchem.data.UserCSVLoader(
      tasks=KINASE_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing train datasets...")
  train_dataset = loader.featurize(
      input_files=train_files, shard_size=shard_size)

  logger.info("Featurizing validation datasets...")
  valid_dataset = loader.featurize(
      input_files=valid_files, shard_size=shard_size)

  logger.info("Featurizing test datasets....")
  test_dataset = loader.featurize(input_files=test_files, shard_size=shard_size)

  logger.info("Remove missing entries from dataset")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  # Shuffle the training data
  logger.info("Shuffling the training dataset")
  train_dataset.sparse_shuffle()

  # Apply transformations
  logger.info("Transformating datasets with transformers")
  transformers = get_transformers(train_dataset)

  for transformer in transformers:
    logger.info("Performing transformations with {}".format(
        transformer.__class__.__name__))

    logger.info("Transforming the training dataset...")
    train_dataset = transformer.transform(train_dataset)

    logger.info("Transforming the validation dataset...")
    valid_dataset = transformer.transform(valid_dataset)

    logger.info("Transforming the test dataset...")
    test_dataset = transformer.transform(test_dataset)

  logger.info("Transformations complete.")
  logger.info("Moving datasets to corresponding directories")

  train_dataset.move(train_dir)
  logger.info("Train dataset moved.")

  valid_dataset.move(valid_dir)
  logger.info("Validation dataset moved.")

  test_dataset.move(test_dir)
  logger.info("Test dataset moved.")

  time2 = time.time()

  ##### TIMING ######

  logger.info("TIMING: KINASE fitting took %0.3f s" % (time2 - time1))

  return train_dataset, valid_dataset, test_dataset


def load_kinase(shard_size=2000, featurizer=None, split=None, reload=True):
  """Load the KINASE dataset; does not do train/test split.

  The datasets live under ``<deepchem data dir>/kinase``. When the
  ``train_dir``, ``valid_dir`` and ``test_dir`` sub-directories all exist
  they are opened as ``DiskDataset`` objects; otherwise the datasets are
  generated with ``gen_kinase``.

  Parameters
  ----------
  shard_size: int, optional
    Shard size forwarded to ``gen_kinase`` when featurizing.
  featurizer: optional
    Currently not used by this function.
  split: optional
    Currently not used by this function.
  reload: bool, optional
    Currently not used by this function.

  Returns
  -------
  tuple
    ``(tasks, (train_dataset, valid_dataset, test_dataset), transformers)``.
  """

  # Tasks are named T_00013 ... T_00111.
  KINASE_tasks = ['T_%05d' % task_num for task_num in range(13, 112)]

  data_dir = os.path.join(deepchem.utils.get_data_dir(), "kinase")

  # makedirs also creates missing parent directories; a plain mkdir raises
  # when the DeepChem data directory itself does not exist yet.
  if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  # NOTE(review): cached datasets are reused whenever all three directories
  # exist, whatever the value of `reload` -- confirm whether reload=False
  # is meant to force re-featurization.
  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):

    logger.info("Reloading existing datasets")
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)

  else:
    logger.info("Featurizing datasets")
    train_dataset, valid_dataset, test_dataset = \
    gen_kinase(KINASE_tasks=KINASE_tasks, train_dir=train_dir,
               valid_dir=valid_dir, test_dir=test_dir, data_dir=data_dir,
               shard_size=shard_size)

  transformers = get_transformers(train_dataset)

  return KINASE_tasks, (train_dataset, valid_dataset,
                        test_dataset), transformers
+172 −0
Original line number Diff line number Diff line
"""
UV Dataset loader
"""

from __future__ import division
from __future__ import unicode_literals

import os
import logging
import time

import numpy as np
import deepchem
from deepchem.molnet.load_function.kaggle_features import merck_descriptors
from deepchem.molnet.load_function.uv_tasks import UV_tasks

logger = logging.getLogger(__name__)

# File names of the three raw UV splits.
TRAIN_FILENAME = "UV_training_disguised_combined_full.csv.gz"
VALID_FILENAME = "UV_test1_disguised_combined_full.csv.gz"
TEST_FILENAME = "UV_test2_disguised_combined_full.csv.gz"

# Each download URL is the shared datasets folder followed by the file name.
_DATASETS_URL = 'https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/'
TRAIN_URL = _DATASETS_URL + TRAIN_FILENAME
VALID_URL = _DATASETS_URL + VALID_FILENAME
TEST_URL = _DATASETS_URL + TEST_FILENAME


def remove_missing_entries(dataset):
  """Strip zeroed-out feature rows from each shard of ``dataset``.

  Missing entries sneak into some of the datasets as feature vectors made
  up entirely of zeros. Every shard from ``dataset.itershards()`` is
  reduced to the rows with at least one nonzero feature and saved back at
  the same shard index via ``dataset.set_shard``.
  """
  shard_index = 0
  for X, y, w, ids in dataset.itershards():
    row_mask = X.any(axis=1)
    missing_count = np.count_nonzero(~row_mask)
    logger.info("Shard %d has %d missing entries.", shard_index, missing_count)
    # Apply the same row mask to features, labels, weights and ids.
    filtered = [arr[row_mask] for arr in (X, y, w, ids)]
    dataset.set_shard(shard_index, *filtered)
    shard_index += 1


def get_transformers(train_dataset):
  """Return the transformations to apply on the dataset.

  None are configured at present: a new empty list is returned on every
  call and ``train_dataset`` is accepted but not inspected.
  """
  return []


def gen_uv(UV_tasks, data_dir, train_dir, valid_dir, test_dir, shard_size=2000):
  """Build the UV train/validation/test datasets from the raw files.

  The three pre-split csv.gz files are looked up in ``data_dir`` and any
  file that is missing is downloaded there first. Each file is featurized,
  all-zero feature rows are dropped, the training set is shuffled, the
  transformers returned by ``get_transformers`` are applied, and the
  datasets are moved into their target directories. Does not do a
  train/test split.

  Parameters
  ----------
  UV_tasks: list
    Task names, handed to the CSV loader as ``tasks``.
  data_dir: str
    Directory that holds (or receives) the raw dataset files.
  train_dir, valid_dir, test_dir: str
    Directories the featurized datasets are moved to.
  shard_size: int, optional
    Shard size passed to ``loader.featurize``.

  Returns
  -------
  tuple
    ``(train_dataset, valid_dataset, test_dataset)``.
  """

  time1 = time.time()

  train_files = os.path.join(data_dir, TRAIN_FILENAME)
  valid_files = os.path.join(data_dir, VALID_FILENAME)
  test_files = os.path.join(data_dir, TEST_FILENAME)

  # Download files if they don't exist. Every file is checked on its own:
  # guarding all three downloads on the train file alone would leave a
  # missing validation/test file unfetched on every later run.
  downloads = [
      (train_files, TRAIN_URL, "Downloading training file...",
       "Training file download complete."),
      (valid_files, VALID_URL, "Downloading validation file...",
       "Validation file download complete."),
      (test_files, TEST_URL, "Downloading test file...",
       "Test file download complete"),
  ]
  for raw_path, url, start_msg, done_msg in downloads:
    if not os.path.exists(raw_path):
      logger.info(start_msg)
      deepchem.utils.download_url(url=url, dest_dir=data_dir)
      logger.info(done_msg)

  # Featurizing datasets
  logger.info("About to featurize UV dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
  loader = deepchem.data.UserCSVLoader(
      tasks=UV_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing train datasets...")
  train_dataset = loader.featurize(
      input_files=train_files, shard_size=shard_size)

  logger.info("Featurizing validation datasets...")
  valid_dataset = loader.featurize(
      input_files=valid_files, shard_size=shard_size)

  logger.info("Featurizing test datasets....")
  test_dataset = loader.featurize(input_files=test_files, shard_size=shard_size)

  # Missing entry removal
  logger.info("Removing missing entries from dataset.")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
  remove_missing_entries(test_dataset)

  # Shuffle the training data
  logger.info("Shuffling the training dataset")
  train_dataset.sparse_shuffle()

  # Apply transformations
  logger.info("Starting transformations")
  transformers = get_transformers(train_dataset)

  for transformer in transformers:
    logger.info("Performing transformations with {}".format(
        transformer.__class__.__name__))

    logger.info("Transforming the training dataset...")
    train_dataset = transformer.transform(train_dataset)

    logger.info("Transforming the validation dataset...")
    valid_dataset = transformer.transform(valid_dataset)

    logger.info("Transforming the test dataset...")
    test_dataset = transformer.transform(test_dataset)

  logger.info("Transformations complete.")
  logger.info("Moving datasets to corresponding directories")

  train_dataset.move(train_dir)
  logger.info("Train dataset moved.")

  valid_dataset.move(valid_dir)
  logger.info("Validation dataset moved.")

  test_dataset.move(test_dir)
  logger.info("Test dataset moved.")

  time2 = time.time()

  ##### TIMING ###########
  logger.info("TIMING: UV fitting took %0.3f s" % (time2 - time1))

  return train_dataset, valid_dataset, test_dataset


def load_uv(shard_size=2000, featurizer=None, split=None, reload=True):
  """Load the UV dataset; does not do train/test split.

  The datasets live under ``<deepchem data dir>/UV``. When the
  ``train_dir``, ``valid_dir`` and ``test_dir`` sub-directories all exist
  they are opened as ``DiskDataset`` objects; otherwise the datasets are
  generated with ``gen_uv``.

  Parameters
  ----------
  shard_size: int, optional
    Shard size forwarded to ``gen_uv`` when featurizing.
  featurizer: optional
    Currently not used by this function.
  split: optional
    Currently not used by this function.
  reload: bool, optional
    Currently not used by this function.

  Returns
  -------
  tuple
    ``(UV_tasks, (train_dataset, valid_dataset, test_dataset),
    transformers)``.
  """

  data_dir = os.path.join(deepchem.utils.get_data_dir(), "UV")

  # makedirs also creates missing parent directories; a plain mkdir raises
  # when the DeepChem data directory itself does not exist yet.
  if not os.path.isdir(data_dir):
    os.makedirs(data_dir)

  train_dir = os.path.join(data_dir, "train_dir")
  valid_dir = os.path.join(data_dir, "valid_dir")
  test_dir = os.path.join(data_dir, "test_dir")

  # NOTE(review): cached datasets are reused whenever all three directories
  # exist, whatever the value of `reload` -- confirm whether reload=False
  # is meant to force re-featurization.
  if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
      os.path.exists(test_dir)):

    logger.info("Reloading existing datasets")
    train_dataset = deepchem.data.DiskDataset(train_dir)
    valid_dataset = deepchem.data.DiskDataset(valid_dir)
    test_dataset = deepchem.data.DiskDataset(test_dir)

  else:
    logger.info("Featurizing datasets")
    train_dataset, valid_dataset, test_dataset = \
    gen_uv(UV_tasks=UV_tasks, data_dir=data_dir, train_dir=train_dir,
           valid_dir=valid_dir, test_dir=test_dir, shard_size=shard_size)

  transformers = get_transformers(train_dataset)

  return UV_tasks, (train_dataset, valid_dataset, test_dataset), transformers
+30 −0
Original line number Diff line number Diff line
# UV task names run consecutively from 'w__210' through 'w__400' (191 tasks).
UV_tasks = ['w__%d' % task_num for task_num in range(210, 401)]
Loading