Commit f0b2f002 authored by Bharath Ramsundar
Browse files

Merge pull request #192 from rbharath/pcba_sklearn

PCBA Logistic models
parents eea57425 c04f8f3c
Loading
Loading
Loading
Loading
+0 −3
Original line number Diff line number Diff line
@@ -365,8 +365,6 @@ class Dataset(object):
      save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
      save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))

 

def compute_sums_and_nb_sample(tensor, W=None):
  """
  Computes sums, squared sums of tensor along axis 0.
@@ -510,4 +508,3 @@ def _df_to_numpy(df, feature_types, tasks):
  # Adding this assertion in to avoid ill-formed outputs.
  assert len(sorted_ids) == len(x) == len(y) == len(w)
  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_muv(base_dir, mode="regression", transform=True, reload=True):
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
+111 −0
Original line number Diff line number Diff line
"""
PCBA dataset loader.
"""
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import os
import numpy as np
import shutil
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from deepchem.utils.save import load_from_disk
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.fingerprints import CircularFingerprint
from deepchem.splits import ScaffoldSplitter
from deepchem.splits import RandomSplitter
from deepchem.datasets import Dataset
from deepchem.transformers import BalancingTransformer
from deepchem.hyperparameters import HyperparamOpt
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import relative_difference
from deepchem.utils.evaluate import Evaluator

def load_pcba(base_dir, reload=True):
  """Load and featurize the PCBA dataset. Does not do train/test split.

  Parameters
  ----------
  base_dir: str
    Directory that holds the results of all analysis. The "features",
    "samples" and "dataset" paths handed to the featurizer and to
    Dataset are subdirectories of it. Created if it does not exist.
  reload: bool, optional (default True)
    Forwarded to DataFeaturizer.featurize and to Dataset. When False,
    an existing base_dir is deleted before anything is written and the
    transformers are applied to the freshly built dataset. When True,
    base_dir is left in place and the transformers are returned without
    being applied here.

  Returns
  -------
  tuple
    (all_PCBA_tasks, featurized_samples, dataset, transformers)
  """
  # NOTE: `reload` is honored as passed in. It must not be reassigned
  # here, otherwise the two `if not reload` branches below can never run.
  verbosity = "high"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload and os.path.exists(base_dir):
    # A fresh (non-reloaded) run starts from an empty base_dir.
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Paths used to store the raw and featurized datasets.
  feature_dir = os.path.join(base_dir, "features")
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  # The raw table is only loaded to report its columns and row count;
  # featurization below reads dataset_file directly.
  raw_dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(raw_dataset.columns.values))
  print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizers = [CircularFingerprint(size=1024)]
  all_PCBA_tasks = ['PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
                    'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
                    'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
                    'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
                    'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
                    'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
                    'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
                    'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
                    'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
                    'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
                    'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
                    'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
                    'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
                    'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
                    'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
                    'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
                    'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
                    'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
                    'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
                    'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
                    'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
                    'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
                    'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
                    'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
                    'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  featurizer = DataFeaturizer(tasks=all_PCBA_tasks,
                              smiles_field="smiles",
                              compound_featurizers=featurizers,
                              verbosity=verbosity)
  featurized_samples = featurizer.featurize(
      dataset_file, feature_dir,
      samples_dir, shard_size=8192,
      reload=reload)

  dataset = Dataset(data_dir=data_dir, samples=featurized_samples,
                    featurizers=featurizers, tasks=all_PCBA_tasks,
                    verbosity=verbosity, reload=reload)

  # Initialize transformers
  input_transformers = []
  output_transformers = []
  weight_transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  transformers = input_transformers + output_transformers + weight_transformers
  # Transformers are only applied here when the dataset was built fresh;
  # on reload they are returned without being re-applied.
  if not reload:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  return all_PCBA_tasks, featurized_samples, dataset, transformers
+2 −1
Original line number Diff line number Diff line
@@ -163,7 +163,6 @@ class DataFeaturizer(object):
                                                  input_type=input_type)


      raw_df = raw_df.apply(process_raw_sample_helper_partial, axis=1, reduce=False)
      nb_sample = raw_df.shape[0]
      interval_points = np.linspace(
          0, nb_sample, np.ceil(float(nb_sample)/shard_size)+1, dtype=int)
@@ -172,6 +171,8 @@ class DataFeaturizer(object):
        log("Sharding and standardizing into shard-%s / %s shards"
            % (str(j+1), len(interval_points)-1), self.verbosity)
        raw_df_shard = raw_df.iloc[range(interval_points[j], interval_points[j+1])]
        raw_df_shard = raw_df_shard.apply(
            process_raw_sample_helper_partial, axis=1, reduce=False)
        
        df = self._standardize_df(raw_df_shard) 

+27 −14
Original line number Diff line number Diff line
@@ -25,6 +25,8 @@ class SingletaskToMultitask(Model):
    self.model_params = model_params
    self.models = {}
    self.model_dir = model_dir
    self.task_model_dirs = {}
    self.model_builder = model_builder
    self.verbosity = verbosity
    log("About to initialize singletask to multitask model",
        self.verbosity, "high")
@@ -38,9 +40,7 @@ class SingletaskToMultitask(Model):
        os.makedirs(task_model_dir)
      log("Initializing model for task %s" % task,
          self.verbosity, "high")
      self.models[task] = model_builder([tasks], task_types, model_params,
                                        task_model_dir,
                                        verbosity=verbosity)
      self.task_model_dirs[task] = task_model_dir
      
  def fit(self, dataset):
    """
@@ -55,7 +55,11 @@ class SingletaskToMultitask(Model):
      w_task = w[:, ind]
      X_task = X[w_task != 0, :]
      y_task = y_task[w_task != 0]
      self.models[task].raw_model.fit(X_task, y_task)
      task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params,
                                      self.task_model_dirs[task],
                                      verbosity=self.verbosity)
      task_model.raw_model.fit(X_task, y_task)
      task_model.save()

  def predict_on_batch(self, X):
    """
@@ -66,12 +70,18 @@ class SingletaskToMultitask(Model):
    y_pred = np.zeros((n_samples, n_tasks))
    for ind, task in enumerate(self.tasks):
      task_type = self.task_types[task]
      task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params,
                                      self.task_model_dirs[task],
                                      verbosity=self.verbosity)
      task_model.reload()

      if task_type == "classification":
        y_pred[:, ind] = self.models[task].predict_on_batch(X)
        y_pred[:, ind] = task_model.predict_on_batch(X)
      elif task_type == "regression":
        y_pred[:, ind] = self.models[task].predict_on_batch(X)
        y_pred[:, ind] = task_model.predict_on_batch(X)
      else:
        raise ValueError("Invalid task_type")
      ############################################### DEBUG
    return y_pred

  def predict_proba_on_batch(self, X, n_classes=2):
@@ -82,17 +92,20 @@ class SingletaskToMultitask(Model):
    n_samples = X.shape[0]
    y_pred = np.zeros((n_samples, n_tasks, n_classes))
    for ind, task in enumerate(self.tasks):
      y_pred[:, ind] = self.models[task].predict_proba_on_batch(X)
      task_model = self.model_builder([task], {task: self.task_types[task]}, self.model_params,
                                      self.task_model_dirs[task],
                                      verbosity=self.verbosity)
      task_model.reload()

      y_pred[:, ind] = task_model.predict_proba_on_batch(X)
    return y_pred

  def save(self):
    """Save all models"""
    for task in self.tasks:
      log("Saving model for task %s" % task, self.verbosity, "high")
      self.models[task].save()
    # Saving is done on-the-fly
    pass

  def load(self):
  def reload(self):
    """Load all models"""
    for task in self.tasks:
      log("Loading model for task %s" % task, self.verbosity, "high")
      self.models[task].load()
    # Loading is done on-the-fly
    pass
Loading