Commit c32a4d83 authored by Bharath Ramsundar, committed by GitHub

Merge pull request #233 from rbharath/model_refactor

Model Refactoring
parents 8dcddcdd 7c3cb086
+17 −11
@@ -114,7 +114,8 @@ class Dataset(object):
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
  """
  def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None, 
-               raw_data=None, verbosity=None, reload=False):
+               raw_data=None, verbosity=None, reload=False,
+               compute_feature_statistics=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
@@ -132,7 +133,9 @@ class Dataset(object):
        metadata_rows = []
        ids, X, y, w = raw_data
        metadata_rows.append(
-            Dataset.write_data_to_disk(self.data_dir, "data", tasks, X, y, w, ids))
+            Dataset.write_data_to_disk(
+                self.data_dir, "data", tasks, X, y, w, ids,
+                compute_feature_statistics=compute_feature_statistics))
        self.metadata_df = Dataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      else:
@@ -153,7 +156,7 @@ class Dataset(object):
  @staticmethod
  def write_dataframe(val, data_dir, featurizer=None, tasks=None,
                      raw_data=None, basename=None, mol_id_field="mol_id",
-                      verbosity=None):
+                      verbosity=None, compute_feature_statistics=None):
    """Writes data from dataframe to disk."""
    if featurizer is not None and tasks is not None:
      feature_type = featurizer.__class__.__name__
@@ -161,6 +164,7 @@ class Dataset(object):
      # TODO(rbharath): This is a hack. clean up.
      if not len(df):
        return None
+      if compute_feature_statistics is None:
        if hasattr(featurizer, "dtype"):
          dtype = featurizer.dtype
          compute_feature_statistics = False
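
The hunk above shows only the head of the new default-handling block. A minimal sketch of the apparent intent, assuming (not shown in this diff) that the else branch falls back to numeric features with statistics enabled:

    # Sketch of the inference when compute_feature_statistics is left as None.
    # Only the dtype branch is visible in the hunk above; the else branch here
    # is an assumption added for illustration.
    if compute_feature_statistics is None:
      if hasattr(featurizer, "dtype"):
        # Featurizer declares its own dtype (e.g. object arrays), so
        # per-feature statistics are skipped for this data.
        dtype = featurizer.dtype
        compute_feature_statistics = False
      else:
        dtype = float
        compute_feature_statistics = True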
@@ -386,7 +390,8 @@ class Dataset(object):
    self.save_to_disk()

  @staticmethod
-  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None, verbosity=None):
+  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None, verbosity=None,
+                 compute_feature_statistics=True):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
@@ -402,7 +407,8 @@ class Dataset(object):
      tasks = np.arange(n_tasks)
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data,
-                   verbosity=verbosity)
+                   verbosity=verbosity,
+                   compute_feature_statistics=compute_feature_statistics)

  @staticmethod
  def merge(merge_dir, datasets):
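
Taken together, these hunks thread a compute_feature_statistics flag from Dataset.__init__ and Dataset.from_numpy down to write_data_to_disk. A minimal usage sketch, under the assumption that the flag controls whether per-feature statistics (the kind later consumed by normalization transformers) are computed while shards are written; the data below is invented for illustration:

    import numpy as np
    from deepchem.datasets import Dataset

    X = np.random.rand(20, 1024)              # toy featurized samples
    y = np.random.randint(2, size=(20, 1))    # toy binary labels, one task
    dataset = Dataset.from_numpy(
        "/tmp/toy_dataset", X, y,
        verbosity="high",
        compute_feature_statistics=False)     # skip statistics for this toy set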
+25 −7
@@ -14,7 +14,7 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

-def load_muv(base_dir, reload=True):
+def load_muv(base_dir, reload=True, frac_train=.8):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
@@ -32,6 +32,8 @@ def load_muv(base_dir, reload=True):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
+  train_dir = os.path.join(base_dir, "train_dataset")
+  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
@@ -44,12 +46,12 @@ def load_muv(base_dir, reload=True):
  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
-  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
-                          'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
-                          'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
-                          'MUV-466', 'MUV-832'])
+  MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
+                      'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
+                      'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
+                      'MUV-466', 'MUV-832'])

-  loader = DataLoader(tasks=all_MUV_tasks,
+  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
@@ -67,4 +69,20 @@ def load_muv(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

-  return all_MUV_tasks, dataset, transformers
+  X, y, w, ids = dataset.to_numpy()
+  num_tasks = 17
+  num_train = int(frac_train * len(dataset))
+  MUV_tasks = MUV_tasks[:num_tasks]
+  print("Using following tasks")
+  print(MUV_tasks)
+  X_train, X_valid = X[:num_train], X[num_train:]
+  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
+  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
+  ids_train, ids_valid = ids[:num_train], ids[num_train:]
+
+  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+                                     w_train, ids_train, MUV_tasks)
+  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+                                     w_valid, ids_valid, MUV_tasks)
+
+  return MUV_tasks, (train_dataset, valid_dataset), transformers
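
With this change load_muv returns a (train, valid) pair instead of a single dataset. A hypothetical caller, assuming the loader module is importable as muv_datasets (the file path is not shown on this page):

    from muv_datasets import load_muv  # assumed module name

    MUV_tasks, (train_dataset, valid_dataset), transformers = load_muv(
        "/tmp/muv_analysis", frac_train=.8)
    print("%d train / %d valid compounds" % (len(train_dataset), len(valid_dataset)))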
+4 −2
@@ -17,7 +17,8 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import NormalizationTransformer

-def load_nci(base_dir, reload=True, force_transform=False):
+def load_nci(base_dir, reload=True, force_transform=False,
+             shard_size=1000, num_shards_per_batch=4):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
@@ -70,7 +71,8 @@ def load_nci(base_dir, reload=True, force_transform=False):
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
-    dataset = loader.featurize(dataset_paths, data_dir)
+    dataset = loader.featurize(dataset_paths, data_dir, shard_size=shard_size,
+                               num_shards_per_batch=num_shards_per_batch)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
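
load_nci keeps its previous return value; the new shard_size and num_shards_per_batch arguments are simply forwarded to DataLoader.featurize, so callers can trade featurization speed against peak memory. A hypothetical call, assuming the module is importable as nci_datasets and that it still returns the (tasks, dataset, transformers) triple used by the other loaders before this refactor:

    from nci_datasets import load_nci  # assumed module name

    tasks, dataset, transformers = load_nci(
        "/tmp/nci_analysis", reload=False,
        shard_size=500, num_shards_per_batch=2)  # smaller shards => lower peak memory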
+24 −4
@@ -14,7 +14,7 @@ from deepchem.featurizers.featurize import DataLoader
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.transformers import BalancingTransformer

-def load_pcba(base_dir, reload=True):
+def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
@@ -31,6 +31,8 @@ def load_pcba(base_dir, reload=True):
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
+  train_dir = os.path.join(base_dir, "train_dataset")
+  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
@@ -43,7 +45,7 @@ def load_pcba(base_dir, reload=True):
  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
-  all_PCBA_tasks = [
+  PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
@@ -70,7 +72,7 @@ def load_pcba(base_dir, reload=True):
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

-  loader = DataLoader(tasks=all_PCBA_tasks,
+  loader = DataLoader(tasks=PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
@@ -89,4 +91,22 @@ def load_pcba(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

-  return all_PCBA_tasks, dataset, transformers
+  print("About to perform train/valid/test split.")
+  num_train = int(frac_train * len(dataset))
+  X, y, w, ids = dataset.to_numpy()
+  num_tasks = 120
+  PCBA_tasks = PCBA_tasks[:num_tasks]
+  print("Using following tasks")
+  print(PCBA_tasks)
+  X_train, X_valid = X[:num_train], X[num_train:]
+  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
+  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
+  ids_train, ids_valid = ids[:num_train], ids[num_train:]
+
+  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+                                     w_train, ids_train, PCBA_tasks)
+  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+                                     w_valid, ids_valid, PCBA_tasks)
+
+
+  return PCBA_tasks, (train_dataset, valid_dataset), transformers
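
A hypothetical caller for the refactored PCBA loader, assuming the module is importable as pcba_datasets and that it returns the same (train, valid) pair shape as the MUV and Tox21 loaders:

    from pcba_datasets import load_pcba  # assumed module name

    PCBA_tasks, (train_dataset, valid_dataset), transformers = load_pcba(
        "/tmp/pcba_analysis", frac_train=.8)
    print("Training on %d tasks" % len(PCBA_tasks))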
+20 −9
@@ -14,12 +14,11 @@ from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer

-def load_tox21(base_dir, reload=True):
+def load_tox21(base_dir, reload=True, num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
-  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
@@ -30,8 +29,9 @@ def load_tox21(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
-  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")
+  train_dir = os.path.join(base_dir, "train")
+  valid_dir = os.path.join(base_dir, "valid")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
@@ -44,19 +44,19 @@ def load_tox21(base_dir, reload=True):
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
-  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
-                     'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
-                     'SR-HSE', 'SR-MMP', 'SR-p53']
+  tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
+                 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
+                 'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
-    loader = DataLoader(tasks=all_tox21_tasks,
+    loader = DataLoader(tasks=tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=8192)
  else:
-    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)
+    dataset = Dataset(data_dir, tox21_tasks, reload=True)

  # Initialize transformers 
  transformers = [
@@ -66,4 +66,15 @@ def load_tox21(base_dir, reload=True):
    for transformer in transformers:
        transformer.transform(dataset)

-  return all_tox21_tasks, dataset, transformers
+  X, y, w, ids = dataset.to_numpy()
+  X_train, X_valid = X[:num_train], X[num_train:]
+  y_train, y_valid = y[:num_train], y[num_train:]
+  w_train, w_valid = w[:num_train], w[num_train:]
+  ids_train, ids_valid = ids[:num_train], ids[num_train:]
+
+  train_dataset = Dataset.from_numpy(train_dir, X_train, y_train,
+                                     w_train, ids_train, tox21_tasks)
+  valid_dataset = Dataset.from_numpy(valid_dir, X_valid, y_valid,
+                                     w_valid, ids_valid, tox21_tasks)
+
+  return tox21_tasks, (train_dataset, valid_dataset), transformers
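
And the corresponding hypothetical caller for Tox21, assuming the module is importable as tox21_datasets; num_train sets where the fixed train/valid split point falls:

    from tox21_datasets import load_tox21  # assumed module name

    tox21_tasks, (train_dataset, valid_dataset), transformers = load_tox21(
        "/tmp/tox21_analysis", num_train=7200)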