Unverified commit af54fae1, authored by Bharath Ramsundar, committed by GitHub
Browse files

Merge pull request #1624 from VIGS25/split-transform-order

Swap Split-Transform order - II
Parents: 42fbae1d, b17c7467
Loading
Loading
Loading
Loading
+18 −10
Original line number Diff line number Diff line
@@ -62,16 +62,14 @@ def load_nci(featurizer='ECFP', shard_size=1000, split='random', reload=True):

  dataset = loader.featurize(dataset_file, shard_size=shard_size)

  # Initialize transformers
  logger.info("About to transform data")
  if split == None:
    logger.info("Split is None, about to transform data")
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  if split == None:
    return all_nci_tasks, (dataset, None, None), transformers

  splitters = {
@@ -80,9 +78,19 @@ def load_nci(featurizer='ECFP', shard_size=1000, split='random', reload=True):
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("Performing new split.")
  logger.info("About to split data with {} splitter.".format(splitter))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.NormalizationTransformer(transform_y=True, dataset=train)
  ]

  logger.info("About to transform dataset.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
+19 −9
Original line number Diff line number Diff line
@@ -82,16 +82,16 @@ def load_pcba_dataset(featurizer='ECFP',
      tasks=PCBA_tasks, smiles_field="smiles", featurizer=featurizer)

  dataset = loader.featurize(dataset_file)
  # Initialize transformers

  if split == None:
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

  logger.info("About to transform data")
    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  if split == None:
    return PCBA_tasks, (dataset, None, None), transformers

  splitters = {
@@ -100,9 +100,19 @@ def load_pcba_dataset(featurizer='ECFP',
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("Performing new split.")
  logger.info("About to split dataset using {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
  ]

  logger.info("About to transform dataset.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
+20 −7
Original line number Diff line number Diff line
@@ -111,16 +111,17 @@ def load_pdbbind_grid(split="random",
    loader = deepchem.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)
    df = pd.read_csv(dataset_file)

    if split == None:
      transformers = [
          deepchem.trans.NormalizationTransformer(
              transform_y=True, dataset=dataset)
      ]

      logger.info("Split is None, about to transform data.")
      for transformer in transformers:
        dataset = transformer.transform(dataset)
    df = pd.read_csv(dataset_file)

    if split == None:
      return tasks, (dataset, None, None), transformers

    splitters = {
@@ -130,8 +131,20 @@ def load_pdbbind_grid(split="random",
        'time': deepchem.splits.TimeSplitterPDBbind(np.array(df['id']))
    }
    splitter = splitters[split]
    logger.info("About to split dataset with {} splitter.".format(split))
    train, valid, test = splitter.train_valid_test_split(dataset)

    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=train)
    ]

    logger.info("About to transform dataset.")
    for transformer in transformers:
      train = transformer.transform(train)
      valid = transformer.transform(valid)
      test = transformer.transform(test)

    if reload:
      deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                               transformers)
+19 −9
Original line number Diff line number Diff line
@@ -47,17 +47,16 @@ def load_ppb(featurizer='ECFP', split='index', reload=True):
      tasks=PPB_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  if split == None:
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]

  logger.info("About to transform data")
    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  if split == None:
    return PPB_tasks, (dataset, None, None), transformers

  splitters = {
@@ -66,8 +65,19 @@ def load_ppb(featurizer='ECFP', split='index', reload=True):
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split dataset with {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.NormalizationTransformer(transform_y=True, dataset=train)
  ]

  logger.info("About to transform dataset.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
+21 −9
Original line number Diff line number Diff line
@@ -51,17 +51,16 @@ def load_sampl(featurizer='ECFP', split='index', reload=True, move_mean=True):
      tasks=SAMPL_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  if split == None:
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset, move_mean=move_mean)
    ]

  logger.info("About to transform data")
    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  if split == None:
    return SAMPL_tasks, (dataset, None, None), transformers

  splitters = {
@@ -70,7 +69,20 @@ def load_sampl(featurizer='ECFP', split='index', reload=True, move_mean=True):
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split dataset with {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train, move_mean=move_mean)
  ]

  logger.info("About to transform dataset.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
Loading