Commit 73f25432 authored by Bharath Ramsundar
Browse files

Updating examples

parent 2c87c060
Loading
Loading
Loading
Loading
+25 −7
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

def load_muv(base_dir, reload=True):
def load_muv(base_dir, reload=True, frac_train=.8):
  """Load MUV datasets and split them sequentially into train/valid sets."""
  # Set some global variables up top
  reload = True
@@ -32,6 +32,8 @@ def load_muv(base_dir, reload=True):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
@@ -44,12 +46,12 @@ def load_muv(base_dir, reload=True):
  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
  MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                      'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                      'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                      'MUV-466', 'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
@@ -67,4 +69,20 @@ def load_muv(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

  return all_MUV_tasks, dataset, transformers
  X, y, w, ids = dataset.to_numpy()
  num_tasks = 17
  num_train = frac_train * len(dataset)
  MUV_tasks = MUV_tasks[:num_tasks]
  print("Using following tasks")
  print(MUV_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, MUV_tasks)
  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, MUV_tasks)
  
  return MUV_tasks, (train_dataset, valid_dataset), transformers
+4 −2
Original line number Diff line number Diff line
@@ -17,7 +17,8 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer

def load_nci(base_dir, reload=True, force_transform=False):
def load_nci(base_dir, reload=True, force_transform=False,
             shard_size=1000, num_shards_per_batch=4):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
@@ -70,7 +71,8 @@ def load_nci(base_dir, reload=True, force_transform=False):
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir)
    dataset = loader.featurize(dataset_paths, data_dir, shard_size=shard_size,
                               num_shards_per_batch=num_shards_per_batch)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
+24 −4
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

def load_pcba(base_dir, reload=True):
def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
@@ -31,6 +31,8 @@ def load_pcba(base_dir, reload=True):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
@@ -43,7 +45,7 @@ def load_pcba(base_dir, reload=True):
  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
  PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
@@ -70,7 +72,7 @@ def load_pcba(base_dir, reload=True):
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
  loader = DataLoader(tasks=PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
@@ -89,4 +91,22 @@ def load_pcba(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

  return all_PCBA_tasks, dataset, transformers
  print("About to perform train/valid/test split.")
  num_train = frac_train * len(dataset)
  X, y, w, ids = dataset.to_numpy()
  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
  print(PCBA_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, PCBA_tasks)
  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, PCBA_tasks)

  
  return PCBA_tasks, dataset, transformers
+20 −9
Original line number Diff line number Diff line
@@ -14,12 +14,11 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer

def load_tox21(base_dir, reload=True):
def load_tox21(base_dir, reload=True, num_train=7200):
  """Load Tox21 datasets; the first num_train samples form the train set, the rest the valid set."""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
@@ -30,8 +29,9 @@ def load_tox21(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train")
  valid_dir = os.path.join(base_dir, "valid")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
@@ -44,19 +44,19 @@ def load_tox21(base_dir, reload=True):
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
  tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                 'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
    loader = DataLoader(tasks=tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=8192)
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)
    dataset = Dataset(data_dir, tox21_tasks, reload=True)

  # Initialize transformers 
  transformers = [
@@ -66,4 +66,15 @@ def load_tox21(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

  return all_tox21_tasks, dataset, transformers
  X, y, w, ids = dataset.to_numpy()
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train], y[num_train:]
  w_train, w_valid = w[:num_train], w[num_train:]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, tox21_tasks)
  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, tox21_tasks)
  
  return tox21_tasks, (train_dataset, valid_dataset), transformers
+1 −1
Original line number Diff line number Diff line
@@ -71,7 +71,7 @@ class HyperparamOpt(object):
        model_dir = tempfile.mkdtemp()

      model = self.model_class(model_params, model_dir)
      model.fit(train_dataset)
      model.fit(train_dataset, **model_params)
      model.save()
    
      evaluator = Evaluator(model, valid_dataset, output_transformers)
Loading