Commit 6c2aa460 authored by Bharath Ramsundar
Browse files

Merge pull request #194 from rbharath/featurization_refactor

Refactor Featurization Code
parents 43538660 fe358db6
Loading
Loading
Loading
Loading
+103 −135
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ import multiprocessing as mp
from functools import partial
from deepchem.utils.save import save_to_disk
from deepchem.utils.save import load_from_disk
from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.utils.save import log

__author__ = "Bharath Ramsundar"
@@ -26,8 +25,7 @@ class Dataset(object):
  """
  Wrapper class for dataset transformed into X, y, w numpy ndarrays.
  """
  def __init__(self, data_dir=None, tasks=[], samples=None, featurizers=None, 
               use_user_specified_features=False,
  def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None, 
               raw_data=None, verbosity=None, reload=False):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
@@ -38,60 +36,71 @@ class Dataset(object):
    assert verbosity in [None, "low", "high"]
    self.verbosity = verbosity

    if featurizers is not None:
      feature_types = [featurizer.__class__.__name__ for featurizer in featurizers]
    else:
      feature_types = None

    if not reload or not os.path.exists(self._get_metadata_filename()):
      log("About to start initializing dataset", self.verbosity)
      if use_user_specified_features:
        feature_types = ["user-specified-features"]

      if samples is not None and feature_types is not None:
        if not isinstance(feature_types, list):
          raise ValueError("feature_types must be a list or None.")

        write_dataset_single_partial = partial(
            write_dataset_single, data_dir=self.data_dir,
            feature_types=feature_types, tasks=tasks)

        metadata_rows = []
        # TODO(rbharath): Still a bit of information leakage.
        for ind, (df_file, df) in enumerate(
            zip(samples.dataset_files, samples.iterdataframes())):
          log("Writing data from file %s, number %d/%d"
              % (df_file, ind+1, len(samples.dataset_files)), self.verbosity)
          retval = write_dataset_single_partial((df_file, df))
          if retval is not None:
            metadata_rows.append(retval)

        self.metadata_df = pd.DataFrame(
            metadata_rows,
            columns=('df_file', 'task_names', 'ids',
                     'X', 'X-transformed', 'y', 'y-transformed',
                     'w', 'w-transformed',
                     'X_sums', 'X_sum_squares', 'X_n',
                     'y_sums', 'y_sum_squares', 'y_n'))
      if metadata_rows is not None:
        self.metadata_df = Dataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      elif raw_data is not None:
        metadata_rows = []
        ids, X, y, w = raw_data
        metadata_rows.append(
            write_dataset_single(val=None, data_dir=self.data_dir, raw_data=raw_data,
                                 basename="data", tasks=tasks))
        self.metadata_df = pd.DataFrame(
            metadata_rows,
            columns=('df_file', 'task_names', 'ids',
                     'X', 'X-transformed', 'y', 'y-transformed',
                     'w', 'w-transformed',
                     'X_sums', 'X_sum_squares', 'X_n',
                     'y_sums', 'y_sum_squares', 'y_n'))
            Dataset.write_data_to_disk(self.data_dir, "data", tasks, X, y, w, ids))
        self.metadata_df = Dataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      #if samples is None and feature_types is not None:  
      else:
        # Create an empty metadata dataframe to be filled at a later time
        basename = "metadata"
        df_file = "metadata.joblib"
        metadata_rows = [Dataset.write_data_to_disk(
            self.data_dir, basename, tasks)]
        self.metadata_df = Dataset.construct_metadata(metadata_rows)
        self.save_to_disk()

    else:
      log("Loading pre-existing metadata file.", self.verbosity)
      if os.path.exists(self._get_metadata_filename()):
        self.metadata_df = load_from_disk(self._get_metadata_filename())
      else:
        raise ValueError("No metadata found.")

  @staticmethod
  def write_dataframe(val, data_dir, featurizers=None, tasks=None,
                      raw_data=None, basename=None):
    """Writes one shard of data to disk and returns its metadata row.

    Two input modes are supported:

    * Dataframe mode (both `featurizers` and `tasks` are not None): `val` is
      unpacked as a `(basename, df)` pair and the arrays are extracted from
      `df` by `_df_to_numpy`, using the class names of `featurizers` as the
      feature types. The `basename` argument is ignored in this mode.
    * Raw mode (otherwise): `raw_data` is unpacked as `(ids, X, y, w)` and the
      arrays are checked for consistent shapes. `val` is not read.

    Args:
      val: `(basename, df)` pair; only used in dataframe mode.
      data_dir: Directory passed through to `Dataset.write_data_to_disk`.
      featurizers: Optional list of featurizer objects.
      tasks: Optional list of tasks, forwarded to `_df_to_numpy` and
        `Dataset.write_data_to_disk`.
      raw_data: Optional `(ids, X, y, w)` tuple; only used in raw mode.
      basename: Optional file-name prefix for raw mode. Defaults to "" when
        not supplied.

    Returns:
      Whatever `Dataset.write_data_to_disk` returns, or None when the
      dataframe in dataframe mode is empty.
    """
    if featurizers is not None and tasks is not None:
      feature_types = [featurizer.__class__.__name__ for featurizer in featurizers]
      (basename, df) = val
      # TODO(rbharath): This is a hack. clean up.
      if not len(df):
        return None
      ids, X, y, w = _df_to_numpy(df, feature_types, tasks)
    else:
      ids, X, y, w = raw_data
      # Only fall back to the empty prefix when the caller gave none;
      # unconditionally resetting it would discard an explicit basename.
      if basename is None:
        basename = ""
      assert X.shape[0] == y.shape[0]
      assert y.shape == w.shape
      assert len(ids) == X.shape[0]
    return Dataset.write_data_to_disk(data_dir, basename, tasks, X, y, w, ids)

  @staticmethod
  def construct_metadata(metadata_entries):
    """Construct a dataframe containing metadata.
  
    metadata_entries should have elements returned by write_data_to_disk
    above.
    """
    metadata_df = pd.DataFrame(
        metadata_entries,
        columns=('basename','task_names', 'ids',
                 'X', 'X-transformed', 'y', 'y-transformed',
                 'w', 'w-transformed',
                 'X_sums', 'X_sum_squares', 'X_n',
                 'y_sums', 'y_sum_squares', 'y_n'))
    return metadata_df

  @staticmethod
  def write_data_to_disk(data_dir, basename, tasks, X=None, y=None, w=None, ids=None):
    out_X = "%s-X.joblib" % basename
    out_X_transformed = "%s-X-transformed.joblib" % basename
    out_X_sums = "%s-X_sums.joblib" % basename
@@ -106,30 +115,30 @@ class Dataset(object):
    out_w_transformed = "%s-w-transformed.joblib" % basename
    out_ids = "%s-ids.joblib" % basename

        metadata_rows = []
        retval = ([df_file, tasks, out_ids,
                   out_X, out_X_transformed,
                   out_y, out_y_transformed,
                   out_w, out_w_transformed,
    if X is not None:
      save_to_disk(X, os.path.join(data_dir, out_X))
      save_to_disk(X, os.path.join(data_dir, out_X_transformed))
      X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
      save_to_disk(X_sums, os.path.join(data_dir, out_X_sums))
      save_to_disk(X_sum_squares, os.path.join(data_dir, out_X_sum_squares))
      save_to_disk(X_n, os.path.join(data_dir, out_X_n))
    if y is not None:
      save_to_disk(y, os.path.join(data_dir, out_y))
      save_to_disk(y, os.path.join(data_dir, out_y_transformed))
      y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)
      save_to_disk(y_sums, os.path.join(data_dir, out_y_sums))
      save_to_disk(y_sum_squares, os.path.join(data_dir, out_y_sum_squares))
      save_to_disk(y_n, os.path.join(data_dir, out_y_n))
    if w is not None:
      save_to_disk(w, os.path.join(data_dir, out_w))
      save_to_disk(w, os.path.join(data_dir, out_w_transformed))
    if ids is not None:
      save_to_disk(ids, os.path.join(data_dir, out_ids))
    return [basename, tasks, out_ids, out_X, out_X_transformed, out_y,
            out_y_transformed, out_w, out_w_transformed,
            out_X_sums, out_X_sum_squares, out_X_n,
                   out_y_sums, out_y_sum_squares, out_y_n])
        metadata_rows.append(retval)
            out_y_sums, out_y_sum_squares, out_y_n]
  
        self.metadata_df = pd.DataFrame(
            metadata_rows,
            columns=('df_file','task_names', 'ids',
                     'X', 'X-transformed', 'y', 'y-transformed',
                     'w', 'w-transformed',
                     'X_sums', 'X_sum_squares', 'X_n',
                     'y_sums', 'y_sum_squares', 'y_n'))
        self.save_to_disk()

    else:
      log("Loading pre-existing metadata file.", self.verbosity)
      if os.path.exists(self._get_metadata_filename()):
        self.metadata_df = load_from_disk(self._get_metadata_filename())
      else:
        raise ValueError("No metadata found.")

  def save_to_disk(self):
    """Save dataset to disk."""
@@ -212,7 +221,11 @@ class Dataset(object):
  def from_numpy(data_dir, X, y, w=None, ids=None, tasks=None):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    ######################################################### DEBUG
    if n_samples > 0:
      y = np.reshape(y, (n_samples, -1))
    ######################################################### DEBUG
    #y = np.reshape(y, (n_samples, -1))
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
@@ -223,6 +236,17 @@ class Dataset(object):
    raw_data = (ids, X, y, w)
    return Dataset(data_dir=data_dir, tasks=tasks, raw_data=raw_data)

  def select(self, select_dir, indices):
    """Builds a new Dataset in select_dir from the rows of self at indices.

    indices is coerced to an integer numpy array and applied to each of the
    four arrays returned by self.to_numpy(); the selected arrays and this
    dataset's task names are then handed to Dataset.from_numpy.
    """
    row_idx = np.array(indices).astype(int)
    X, y, w, ids = self.to_numpy()
    task_names = self.get_task_names()
    X_sel = X[row_idx]
    y_sel = y[row_idx]
    w_sel = w[row_idx]
    ids_sel = ids[row_idx]
    return Dataset.from_numpy(
        select_dir, X_sel, y_sel, w_sel, ids_sel, task_names)
    
  def to_numpy(self):
    """
    Transforms internal data into arrays X, y, w
@@ -396,63 +420,6 @@ def compute_sums_and_nb_sample(tensor, W=None):

# The following are all associated with Dataset, but are separate functions to
# make it easy to use multiprocessing.

def write_dataset_single(val, data_dir, feature_types=None, tasks=None,
                         raw_data=None, basename=None):
  """Saves one shard's arrays and moment statistics; returns its metadata row.

  When both feature_types and tasks are given, val is unpacked as a
  (df_file, df) pair, the arrays come from _df_to_numpy, and the output
  prefix is df_file's base name without its extension. Otherwise raw_data is
  unpacked as (ids, X, y, w), shape-checked, and the caller's basename is
  used as the prefix.

  Returns None if the dataframe is empty; otherwise a list holding df_file,
  tasks and the names of the files written under data_dir.
  """
  from_dataframe = feature_types is not None and tasks is not None
  if from_dataframe:
    df_file, df = val
    # TODO(rbharath): This is a hack. clean up.
    if len(df) == 0:
      return None
    ids, X, y, w = _df_to_numpy(df, feature_types, tasks)
  else:
    ids, X, y, w = raw_data
    df_file = ""
    assert X.shape[0] == y.shape[0]
    assert y.shape == w.shape
    assert len(ids) == X.shape[0]

  # Moments are computed once here and stored next to the raw arrays.
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

  if from_dataframe:
    basename = os.path.splitext(os.path.basename(df_file))[0]

  out_ids = "%s-ids.joblib" % basename
  out_X = "%s-X.joblib" % basename
  out_X_transformed = "%s-X-transformed.joblib" % basename
  out_y = "%s-y.joblib" % basename
  out_y_transformed = "%s-y-transformed.joblib" % basename
  out_w = "%s-w.joblib" % basename
  out_w_transformed = "%s-w-transformed.joblib" % basename
  out_X_sums = "%s-X_sums.joblib" % basename
  out_X_sum_squares = "%s-X_sum_squares.joblib" % basename
  out_X_n = "%s-X_n.joblib" % basename
  out_y_sums = "%s-y_sums.joblib" % basename
  out_y_sum_squares = "%s-y_sum_squares.joblib" % basename
  out_y_n = "%s-y_n.joblib" % basename

  # (payload, file name) pairs, in the order they are written. The
  # "transformed" files start out as copies of the untransformed arrays.
  pending_writes = [
      (X, out_X),
      (y, out_y),
      (w, out_w),
      (X_sums, out_X_sums),
      (X_sum_squares, out_X_sum_squares),
      (X_n, out_X_n),
      (y_sums, out_y_sums),
      (y_sum_squares, out_y_sum_squares),
      (y_n, out_y_n),
      (X, out_X_transformed),
      (y, out_y_transformed),
      (w, out_w_transformed),
      (ids, out_ids),
  ]
  for payload, filename in pending_writes:
    save_to_disk(payload, os.path.join(data_dir, filename))

  return [df_file, tasks, out_ids,
          out_X, out_X_transformed,
          out_y, out_y_transformed,
          out_w, out_w_transformed,
          out_X_sums, out_X_sum_squares, out_X_n,
          out_y_sums, out_y_sum_squares, out_y_n]

# TODO(rbharath): This function is complicated enough that it should have unit
# tests.
def _df_to_numpy(df, feature_types, tasks):
  """Transforms a featurized dataset df into standard set of numpy arrays"""
  if not set(feature_types).issubset(df.keys()):
@@ -461,7 +428,8 @@ def _df_to_numpy(df, feature_types, tasks):
  # perform common train/test split across all tasks
  n_samples = df.shape[0]
  n_tasks = len(tasks)
  y = np.hstack([np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  y = np.hstack([
      np.reshape(np.array(df[task].values), (n_samples, 1)) for task in tasks])
  w = np.ones((n_samples, n_tasks))
  missing = np.zeros_like(y).astype(int)
  tensors = []
+29 −43
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ import deepchem
import tempfile, shutil
from deepchem.utils.save import load_from_disk
from deepchem.splits import SpecifiedSplitter
from deepchem.featurizers import UserDefinedFeaturizer 
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.datasets import Dataset
from deepchem.transformers import NormalizationTransformer
@@ -26,6 +27,7 @@ def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
@@ -53,17 +55,12 @@ def load_bace(mode="regression", transform=True, split="20-80"):

  #Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  full_dir = os.path.join(base_dir, "full_dataset")
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")
  crystal_feature_dir = os.path.join(base_dir, "crystal_feature")
  crystal_samples_dir = os.path.join(base_dir, "crystal_samples")


  if mode == "regression":
    bace_tasks = ["pIC50"]
@@ -71,42 +68,36 @@ def load_bace(mode="regression", transform=True, split="20-80"):
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizers = [UserDefinedFeaturizer(user_specified_features)]
  featurizer = DataFeaturizer(tasks=bace_tasks,
                              smiles_field="mol",
                              id_field="CID",
                              user_specified_features=user_specified_features,
                              split_field="Model")
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir, samples_dir, shard_size=2000,
      reload=reload)

  crystal_featurized_samples = featurizer.featurize(
      crystal_dataset_file, crystal_feature_dir, crystal_samples_dir,
  shard_size=2000)
                              featurizers=featurizers)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = featurizer.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)


  splitter = SpecifiedSplitter(verbosity=verbosity)
  train_samples, valid_samples, test_samples = splitter.train_valid_test_split(
      featurized_samples, train_dir, valid_dir, test_dir,
      reload=reload)
  if (not reload or not os.path.exists(train_dir) or not os.path.exists(valid_dir)
      or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  #NOTE THE RENAMING:
  if split == "20-80":
    valid_samples, test_samples = test_samples, valid_samples

  train_dataset = Dataset(data_dir=train_dir, samples=train_samples, 
                          featurizers=[], tasks=bace_tasks,
                          use_user_specified_features=True)
  valid_dataset = Dataset(data_dir=valid_dir, samples=valid_samples, 
                          featurizers=[], tasks=bace_tasks,
                          use_user_specified_features=True)
  test_dataset = Dataset(data_dir=test_dir, samples=test_samples, 
                         featurizers=[], tasks=bace_tasks,
                         use_user_specified_features=True)
  crystal_dataset = Dataset(data_dir=crystal_dir,
                            samples=crystal_featurized_samples, 
                            featurizers=[], tasks=bace_tasks,
                            use_user_specified_features=True)
    valid_dataset, test_dataset = test_dataset, valid_dataset
  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
@@ -116,7 +107,7 @@ def load_bace(mode="regression", transform=True, split="20-80"):
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform:
  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
@@ -130,14 +121,9 @@ def load_bace(mode="regression", transform=True, split="20-80"):
    input_transformers, output_transformers = [], []
  
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset, crystal_dataset]:
    for transformer in transformers:
      transformer.transform(train_dataset)
  for transformer in transformers:
      transformer.transform(valid_dataset)
  for transformer in transformers:
      transformer.transform(test_dataset)
  for transformer in transformers:
      transformer.transform(crystal_dataset)
        transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
+10 −31
Original line number Diff line number Diff line
@@ -8,24 +8,11 @@ from __future__ import unicode_literals
import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
@@ -33,10 +20,10 @@ def load_muv(base_dir, reload=True):
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  #base_dir = "/scratch/users/rbharath/muv_multitask_analysis"
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
@@ -44,8 +31,6 @@ def load_muv(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
@@ -66,26 +51,20 @@ def load_muv(base_dir, reload=True):

  featurizer = DataFeaturizer(tasks=all_MUV_tasks,
                              smiles_field="smiles",
                              compound_featurizers=featurizers,
                              featurizers=featurizers,
                              verbosity=verbosity)
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir,
      samples_dir, shard_size=8192,
      reload=reload)

  dataset = Dataset(data_dir=data_dir, samples=featurized_samples, 
                    featurizers=featurizers, tasks=all_MUV_tasks,
                    verbosity=verbosity, reload=reload)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  input_transformers = []
  output_transformers = []
  weight_transformers = [
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  transformers = input_transformers + output_transformers + weight_transformers
  if not reload:
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_MUV_tasks, featurized_samples, dataset, transformers
  return all_MUV_tasks, dataset, transformers
+37 −56
Original line number Diff line number Diff line
@@ -8,31 +8,18 @@ from __future__ import unicode_literals
import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
@@ -43,8 +30,6 @@ def load_pcba(base_dir, reload=True):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
@@ -58,7 +43,8 @@ def load_pcba(base_dir, reload=True):
  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  all_PCBA_tasks = ['PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
@@ -86,26 +72,21 @@ def load_pcba(base_dir, reload=True):

  featurizer = DataFeaturizer(tasks=all_PCBA_tasks,
                              smiles_field="smiles",
                              compound_featurizers=featurizers,
                              featurizers=featurizers,
                              verbosity=verbosity)
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir,
      samples_dir, shard_size=8192,
      reload=reload)

  dataset = Dataset(data_dir=data_dir, samples=featurized_samples, 
                    featurizers=featurizers, tasks=all_PCBA_tasks,
                    verbosity=verbosity, reload=reload)
  if not reload or not os.path.exists(data_dir):
    dataset = featurizer.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  input_transformers = []
  output_transformers = []
  weight_transformers = [
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  transformers = input_transformers + output_transformers + weight_transformers
  if not reload:

  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_PCBA_tasks, featurized_samples, dataset, transformers
  return all_PCBA_tasks, dataset, transformers
+38 −60

File changed.

Preview size limit exceeded, changes collapsed.

Loading