Commit f40abcbd authored by Bharath Ramsundar
Browse files

First round of refactoring to fit new structure.

parent 31b922a4
Loading
Loading
Loading
Loading
+19 −19
Original line number Diff line number Diff line
"""
Implements multitask-RF learning on Google-vs datasets.
Convenience script to train basic models on supported datasets.
"""
import numpy as np
from dataset_arxiv import dataset_to_numpy
from dataset_arxiv import get_target_names
from dataset_arxiv import load_datasets
from dataset_arxiv import load_and_transform_dataset
from dataset_arxiv import train_test_random_split
from dataset_arxiv import train_test_scaffold_split
from dataset_arxiv import scaffold_separate
from dataset_arxiv import eval_model
from dataset_arxiv import compute_roc_auc_scores
from dataset_arxiv import compute_r2_scores
from dataset_arxiv import compute_rms_scores
from dataset_arxiv import multitask_to_singletask
from dataset_arxiv import compare_datasets
from keras_mlp import fit_singletask_mlp
from keras_mlp import fit_multitask_mlp
from keras_mlp import train_multitask_model
from sklearn_models import fit_singletask_models
from sklearn_models import fit_multitask_rf
from deep_chem.models.keras import fit_singletask_mlp
from deep_chem.models.keras import fit_multitask_mlp
from deep_chem.models.keras import train_multitask_model
from deep_chem.models.sklearn import fit_singletask_models
from deep_chem.models.sklearn import fit_multitask_rf
from deep_chem.utils.analysis import compare_datasets
from deep_chem.utils.evaluate import eval_model
from deep_chem.utils.evaluate import compute_roc_auc_scores
from deep_chem.utils.evaluate import compute_r2_scores
from deep_chem.utils.evaluate import compute_rms_scores
from deep_chem.utils.load import get_target_names
from deep_chem.utils.load import load_datasets
from deep_chem.utils.load import load_and_transform_dataset
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import train_test_random_split
from deep_chem.utils.preprocess import train_test_scaffold_split
from deep_chem.utils.preprocess import scaffold_separate
from deep_chem.utils.preprocess import multitask_to_singletask

def filter_outliers(X, y):
  """Removes outlier values from dataset.
+82 −0
Original line number Diff line number Diff line
"""
Utility functions to compare datasets to one another.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

def summarize_distribution(y):
  """Prints summary statistics for a regression dataset.

  Writes the mean, standard deviation, minimum, maximum and the result of
  np.histogram(y) to stdout. Nothing is returned.

  Parameters
  ----------
  y: np.ndarray
    A 1D numpy array containing distribution.
  """
  # Imported locally: the header of this module shows no top-level numpy
  # import, so a bare `np` reference would otherwise raise NameError.
  import numpy as np

  # All statistics are computed before anything is printed, so a failure
  # (e.g. np.amin on an empty array) leaves no partial output behind.
  mean = np.mean(y)
  std = np.std(y)
  minval = np.amin(y)
  maxval = np.amax(y)
  hist = np.histogram(y)
  # Single-argument print(...) produces the same output under Python 2
  # (parenthesized expression) and Python 3 (function call).
  print("Mean: %f" % mean)
  print("Std: %f" % std)
  print("Min: %f" % minval)
  print("Max: %f" % maxval)
  print("Histogram: ")
  print(hist)

def compare_all_datasets(muv_path="/home/rbharath/vs-datasets/muv",
                         pcba_path="/home/rbharath/vs-datasets/pcba",
                         dude_path="/home/rbharath/vs-datasets/dude",
                         pfizer_path="/home/rbharath/private-datasets/pfizer"):
  """Compare all datasets in our collection.

  Loads the muv, pcba, dude and pfizer datasets with load_datasets and
  prints the pairwise scaffold overlap reported by compare_datasets, each
  report preceded by a separator line.

  TODO(rbharath): Make this actually robust.

  Parameters
  ----------
  muv_path: string
    Directory passed to load_datasets for the muv dataset.
  pcba_path: string
    Directory passed to load_datasets for the pcba dataset.
  dude_path: string
    Directory passed to load_datasets for the dude dataset.
  pfizer_path: string
    Directory passed to load_datasets for the pfizer dataset.
  """
  # Imported locally: this module shows no top-level import of load_datasets.
  # NOTE(review): module path taken from the training script's imports;
  # confirm it matches the package layout.
  from deep_chem.utils.load import load_datasets

  # Loaded in the same order as before: muv, pcba, dude, pfizer.
  muv_data = load_datasets([muv_path])
  pcba_data = load_datasets([pcba_path])
  dude_data = load_datasets([dude_path])
  pfizer_data = load_datasets([pfizer_path])
  datasets = {
      "muv": muv_data,
      "pcba": pcba_data,
      "dude": dude_data,
      "pfizer": pfizer_data,
  }
  # Comparison order is kept explicit so the printed report is unchanged.
  comparisons = [
      ("muv", "pcba"),
      ("pfizer", "muv"),
      ("pfizer", "pcba"),
      ("muv", "dude"),
      ("pcba", "dude"),
      ("pfizer", "dude"),
  ]
  for first_name, second_name in comparisons:
    print("----------------------")
    compare_datasets(first_name, datasets[first_name],
                     second_name, datasets[second_name])

def compare_datasets(first_name, first, second_name, second):
  """Counts the overlap between two provided datasets.

  Prints, for each dataset, the number of entries and the number of
  distinct scaffolds, followed by the number of scaffolds present in both.
  Nothing is returned.

  Parameters
  ----------
  first_name: string
    Name of first dataset
  first: dict
    Data dictionary generated by load_datasets. Every value must unpack
    into exactly three items, the second of which is the (hashable)
    scaffold.
  second_name: string
    Name of second dataset
  second: dict
    Data dictionary generated by load_datasets, same layout as first.
  """
  # Only the middle element of each 3-item record is used.
  first_scaffolds = {scaffold for _, scaffold, _ in first.values()}
  # Single-argument print(...) behaves the same under Python 2 and 3.
  print("%d molecules in %s dataset" % (len(first), first_name))
  print("%d scaffolds in %s dataset" % (len(first_scaffolds), first_name))
  second_scaffolds = {scaffold for _, scaffold, _ in second.values()}
  print("%d molecules in %s dataset" % (len(second), second_name))
  print("%d scaffolds in %s dataset" % (len(second_scaffolds), second_name))
  common_scaffolds = first_scaffolds & second_scaffolds
  print("%d scaffolds in both" % len(common_scaffolds))
+18 −0
Original line number Diff line number Diff line
"""
Code for processing the Google vs-datasets.
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2015, Stanford University"
__license__ = "LGPL"

import os
import numpy as np
import gzip
import cPickle as pickle
import csv
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
Loading