Commit c2c07365 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Some more pylint fixes.

parent 41e399bf
Loading
Loading
Loading
Loading
+18 −24
Original line number Diff line number Diff line
"""
Code for processing datasets using scikit-learn.
"""
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import MultiTaskLasso 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LassoLarsCV
from sklearn.svm import SVR

def fit_singletask_models(train_data, modeltype, task_types):
def fit_singletask_models(train_data, modeltype):
  """Fits singletask linear regression models to potency.

  Parameters
@@ -26,23 +23,20 @@ def fit_singletask_models(train_data, modeltype, task_types):
    Type of split for train/test. Either random or scaffold.
  seed: int (optional)
    Seed to initialize np.random.
  task_types: dict 
    dict mapping target names to output type. Each output type must be either
    "classification" or "regression".
  output_transforms: dict
    dict mapping target names to label transform. Each output type must be either
    None or "log". Only for regression outputs.
  """
  models = {}
  for index, target in enumerate(sorted(train_data.keys())):
  for target in sorted(train_data.keys()):
    print "Building model for target %s" % target
    (_, X_train, y_train, W_train) = train_data[target]
    (_, X_train, y_train, _) = train_data[target]
    if modeltype == "rf_regressor":
      model = RandomForestRegressor(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
      model = RandomForestRegressor(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "rf_classifier":
      model = RandomForestClassifier(n_estimators=500, n_jobs=-1,
          warm_start=True, max_features="sqrt")
      model = RandomForestClassifier(
          n_estimators=500, n_jobs=-1, warm_start=True, max_features="sqrt")
    elif modeltype == "logistic":
      model = LogisticRegression(class_weight="auto")
    elif modeltype == "linear":
@@ -63,11 +57,11 @@ def fit_singletask_models(train_data, modeltype, task_types):

# TODO(rbharath): I believe this is broken. Update it to work with the rest of
# the package.
def fit_multitask_rf(train_data):
  """Fits a multitask random forest classifier to the provided dataset.

  Parameters
  ----------
  train_data: tuple
    Tuple of (ids, X_train, y_train, weights). Only X_train and y_train
    are consumed here; ids and weights are ignored.

  Returns
  -------
  sklearn.ensemble.RandomForestClassifier
    The fitted model.
  """
  (_, X_train, y_train, _) = train_data
  # NOTE(review): class_weight="auto" was deprecated and later removed from
  # scikit-learn in favor of "balanced" -- confirm against the pinned
  # sklearn version before changing it, since the string changes behavior.
  model = RandomForestClassifier(
      n_estimators=100, n_jobs=-1, class_weight="auto")
  model.fit(X_train, y_train)
  return model
+1 −1
Original line number Diff line number Diff line
@@ -453,7 +453,7 @@ def _fit_model(paths, model, task_type, n_hidden, learning_rate, dropout,
    models = fit_3D_convolution(
        train_dict, task_types, nb_epoch=n_epochs, batch_size=batch_size)
  else:
    models = fit_singletask_models(train_dict, model, task_types)
    models = fit_singletask_models(train_dict, model)
  modeltype = get_model_type(model)
  save_model(models, modeltype, saved_out)

+43 −39
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@ import csv
import numpy as np
import warnings
import sys
from deep_chem.utils.preprocess import dataset_to_numpy
from deep_chem.utils.preprocess import labels_to_weights
from deep_chem.utils.preprocess import undo_transform_outputs
from sklearn.metrics import mean_squared_error
@@ -20,21 +19,24 @@ from sklearn.metrics import r2_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from rdkit import Chem
from rdkit.Chem.Descriptors import ExactMolWt

def compute_model_performance(raw_test_data, test_data, task_types, models,
  modeltype, output_transforms, aucs=True, r2s=False, rms=False, recall=False,
  accuracy=False, mcc=False, print_file=sys.stdout):
                              modeltype, output_transforms, aucs=True,
                              r2s=False, rms=False, recall=False,
                              accuracy=False, mcc=False,
                              print_file=sys.stdout):
  """Computes statistics for model performance on test set."""
  all_results, auc_vals, r2_vals, rms_vals, mcc_vals, recall_vals, accuracy_vals = {}, {}, {}, {}, {}, {}, {}
  all_results = {}
  auc_vals, mcc_vals, recall_vals, accuracy_vals = {}, {}, {}, {}
  r2_vals, rms_vals = {}, {}
  for index, target in enumerate(sorted(test_data.keys())):
    print("Evaluating model %d" % index, file=print_file)
    print("Target %s" % target, file=print_file)
    (test_ids, Xtest, ytest, wtest) = test_data[target]
    (test_ids, X_test, y_test, w_test) = test_data[target]
    (_, _, ytest_raw, _) = raw_test_data[target]
    model = models[target]
    results = eval_model(test_ids, Xtest, ytest, ytest_raw, wtest, model,
    results = eval_model(
        test_ids, X_test, y_test, ytest_raw, w_test, model,
        {target: task_types[target]}, modeltype=modeltype,
        output_transforms=output_transforms)
    all_results[target] = results[target]
@@ -93,7 +95,8 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
  # an upstream change so the evaluator doesn't have to worry about this.
  if len(np.shape(X)) > 2:  # Dealing with 3D data
    if len(np.shape(X)) != 5:
      raise ValueError("Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
      raise ValueError(
          "Tensorial datatype must be of shape (n_samples, N, N, N, n_channels).")
    (n_samples, axis_length, _, _, n_channels) = np.shape(X)
    X = np.reshape(X, (n_samples, axis_length, n_channels, axis_length, axis_length))
  if modeltype == "keras-graph":
@@ -112,13 +115,14 @@ def model_predictions(X, model, n_targets, task_types, modeltype="sklearn"):
    ypreds = model.predict(X)
  else:
    raise ValueError("Improper modeltype.")
  if type(ypreds) == np.ndarray:
  if isinstance(ypreds, np.ndarray):
    ypreds = np.squeeze(ypreds)
  if type(ypreds) != list:
  if not isinstance(ypreds, list):
    ypreds = [ypreds]
  return ypreds

def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types, output_transforms, modeltype="sklearn"):
def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types,
               output_transforms, modeltype="sklearn"):
  """Evaluates the provided model on the test-set.

  Returns a dict which maps target-names to pairs of np.ndarrays (ytrue,
@@ -140,11 +144,11 @@ def eval_model(ids, X, Ytrue, Ytrue_raw, W, model, task_types, output_transforms
    Either sklearn, keras, or keras_multitask
  """
  sorted_targets = sorted(task_types.keys())
  ypreds = model_predictions(X, model, len(task_types),
      task_types, modeltype=modeltype)
  ypreds = model_predictions(
      X, model, len(task_types), task_types, modeltype=modeltype)
  results = {}
  for target_ind, target in enumerate(sorted_targets):
    ytrue_raw, ytrue, ypred = Ytrue_raw[:, target_ind], Ytrue[:, target_ind], ypreds[target_ind]
    ytrue_raw, _, ypred = Ytrue_raw[:, target_ind], Ytrue[:, target_ind], ypreds[target_ind]
    ypred = undo_transform_outputs(ytrue_raw, ypred, output_transforms)
    results[target] = (ids, np.squeeze(ytrue_raw), np.squeeze(ypred))
  return results
@@ -156,13 +160,13 @@ def results_to_csv(results, out, task_type="classification"):
    if task_type == "classification":
      yscores = np.around(yscores[:, 1]).astype(int)
    elif task_type == "regression":
      if type(yscores[0]) == np.ndarray:
      if isinstance(yscores[0], np.ndarray):
        yscores = yscores[:, 0]
    with open(out, "wb") as csvfile:
      csvwriter = csv.writer(csvfile, delimiter="\t")
      csvwriter.writerow(["Ids", "True", "Model-Prediction"])
      for id, ytrue, yscore in zip(mol_ids, ytrues, yscores):
        csvwriter.writerow([id, ytrue, yscore])
      for mol_id, ytrue, yscore in zip(mol_ids, ytrues, yscores):
        csvwriter.writerow([mol_id, ytrue, yscore])
    print("Writing results on test set for target %s to %s" % (target, out))


@@ -228,7 +232,7 @@ def compute_roc_auc_scores(results, task_types):
    sample_weights = labels_to_weights(ytrue)
    try:
      score = roc_auc_score(ytrue, yscore[:, 1], sample_weight=sample_weights)
    except Exception as e:
    except Exception:
      warnings.warn("ROC AUC score calculation failed.")
      score = 0.5
    print("Target %s: AUC %f" % (target, score))