Commit 578bea4c authored by Bharath Ramsundar

Pylinted all the code.

parent 91e53855
+11 −13
@@ -24,6 +24,7 @@ class Model(object):
     self.model_type = model_type
     self.task_types = task_types
     self.model_params = model_params
+    self.raw_model = None

   def fit_on_batch(self, X, y, w):
     """
@@ -49,7 +50,7 @@ class Model(object):
     """
     Return raw model.
     """
-    return(self.raw_model)
+    return self.raw_model

   @staticmethod
   def get_model_filename(out_dir):
@@ -130,7 +131,7 @@ class Model(object):
         print("Training on batch-%s/epoch-%s" % (str(i+1), str(epoch+1)))
         nb_sample = np.shape(X)[0]
         interval_points = np.linspace(
-            0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
+            0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1, dtype=int)
         for j in range(len(interval_points)-1):
           indices = range(interval_points[j], interval_points[j+1])
           X_batch = X[indices, :]
@@ -138,9 +139,6 @@ class Model(object):
           w_batch = w[indices]
           self.fit_on_batch(X_batch, y_batch, w_batch)

-  # TODO(rbharath): What does this function do when y is not provided. Suspect
-  # it breaks. Need to fix.
-
   # TODO(rbharath): The structure of the produced df might be
   # complicated. Better way to model?
   def predict(self, dataset):
@@ -158,9 +156,9 @@ class Model(object):
     for (X, y, w, ids) in dataset.itershards():
       nb_sample = np.shape(X)[0]
       interval_points = np.linspace(
-          0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1).astype(int)
+          0, nb_sample, np.ceil(float(nb_sample)/batch_size)+1, dtype=int)
       y_preds = []
-      for j in range(0,len(interval_points)-1):
+      for j in range(len(interval_points)-1):
         indices = range(interval_points[j], interval_points[j+1])
         y_preds.append(self.predict_on_batch(X[indices, :]))
       y_pred = np.concatenate(y_preds)
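
A note on the two np.linspace changes above: passing dtype=int computes the integer batch boundaries directly instead of materializing a float array and casting with .astype(int). A minimal standalone sketch of the batching pattern, with hypothetical nb_sample and batch_size values (and the count wrapped in int(), since newer NumPy requires an integer num argument):

  import numpy as np

  nb_sample, batch_size = 10, 4  # hypothetical sizes
  n_points = int(np.ceil(float(nb_sample) / batch_size)) + 1
  # Integer batch boundaries in one step: [0, 3, 6, 10]
  interval_points = np.linspace(0, nb_sample, n_points, dtype=int)
  for j in range(len(interval_points) - 1):
      # Yields batches [0, 1, 2], [3, 4, 5], [6, 7, 8, 9]
      print(list(range(interval_points[j], interval_points[j + 1])))
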
+9 −6
@@ -66,7 +66,8 @@ class MultiTaskDNN(KerasModel):
           Dense(model_params["nb_hidden"], init='uniform',
                 activation=model_params["activation"]),
           name="dense", input="input")
-      model.add_node(Dropout(model_params["dropout"]), name="dropout",
+      model.add_node(Dropout(model_params["dropout"]),
+                     name="dropout",
                      input="dense")
       top_layer = "dropout"
       for ind, task in enumerate(sorted_tasks):
@@ -96,6 +97,7 @@ class MultiTaskDNN(KerasModel):
       self.raw_model = model

   def get_data_dict(self, X, y=None):
+    """Wrap data X in dict for graph computations (Keras graph only for now)."""
     data = {}
     data["input"] = X
     for ind, task in enumerate(sorted(self.task_types.keys())):
@@ -110,7 +112,7 @@ class MultiTaskDNN(KerasModel):
   def get_sample_weight(self, w):
     """Get dictionaries needed to fit models"""
     sample_weight = {}
-    for ind, task in enumerate(sorted(self.task_types.keys())):
+    for ind in range(len(sorted(self.task_types.keys()))):
       sample_weight["task%d" % ind] = w[:, ind]
     return sample_weight

@@ -124,6 +126,7 @@ class MultiTaskDNN(KerasModel):
     data = self.get_data_dict(X, y)
     sample_weight = self.get_sample_weight(w)
     loss = self.raw_model.train_on_batch(data, sample_weight=sample_weight)
+    return loss

   def predict_on_batch(self, X):
     """
+12 −5
@@ -15,16 +15,23 @@ from deepchem.models import Model
 from deepchem.models.deep import KerasModel

 def shuffle_shape(shape):
+  """
+  Shuffle shape of form (N, N, N, C) into (C, N, N, N).
+  """
   (axis_length, _, _, n_channels) = shape
   shuffled_shape = (n_channels, axis_length, axis_length, axis_length)
   return shuffled_shape

 def shuffle_data(X):
+  """
+  Make data of shape (C, N, N, N) from (N, N, N, C)
+
+  C is n_channels, N is axis_length.
+  """
   (n_samples, axis_length, _, _, n_channels) = np.shape(X)
   X = np.reshape(X, (n_samples, n_channels, axis_length, axis_length, axis_length))
   return X

-
 class DockingDNN(KerasModel):
   """
   Wrapper class for fitting 3D convolutional networks for deep docking.
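
One caveat on the shuffle_data docstring added above: np.reshape only reinterprets the memory layout, so it does not actually move the channel axis; an axis permutation needs np.transpose. A small sketch of the difference, with hypothetical sizes:

  import numpy as np

  X = np.arange(108).reshape(2, 3, 3, 3, 2)  # (n_samples, N, N, N, C)

  # reshape keeps element order, so values land in the wrong channels.
  reshaped = np.reshape(X, (2, 2, 3, 3, 3))
  # transpose genuinely moves the channel axis to the front.
  transposed = np.transpose(X, (0, 4, 1, 2, 3))

  print(np.array_equal(reshaped, transposed))  # False
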
+17 −20
@@ -7,6 +7,7 @@ from __future__ import unicode_literals
 import argparse
 import glob
 import os
+import multiprocessing as mp
 from functools import partial
 from deepchem.utils.featurize import DataFeaturizer
 from deepchem.utils.featurize import FeaturizedSamples
@@ -59,10 +60,6 @@ def add_transforms_group(cmd):
       choices=["normalize", "log"],
       help="Supported transforms are 'log' and 'normalize'. 'None' will be taken\n"
            "to mean no transforms are required.")
-  transform_group.add_argument(
-      "--mode", default="singletask",
-      choices=["singletask", "multitask"],
-      help="Type of model being built.")
   transform_group.add_argument(
       "--feature-types", nargs="+", required=1,
       choices=["user-specified-features", "ECFP", "RDKIT-descriptors"],
@@ -175,9 +172,10 @@ def extract_model_params(args):
             "activation", "momentum", "nesterov"]

   model_params = {param : getattr(args, param) for param in params}
-  return(model_params)
+  return model_params

 def ensure_exists(dirs):
+  """Creates dirs if they don't exist."""
   for directory in dirs:
     if not os.path.exists(directory):
       os.makedirs(directory)
@@ -201,7 +199,6 @@ def create_model(args):
                                         args.data_dir)
     ensure_exists([feature_dir, data_dir, model_dir])

-
   if args.featurize:
     print("+++++++++++++++++++++++++++++++++")
     print("Perform featurization")
@@ -214,7 +211,7 @@ def create_model(args):
     print("+++++++++++++++++++++++++++++++++")
     print("Generate dataset for featurized samples")
     samples_dir = os.path.join(data_dir, "samples")
-    samples = FeaturizedSamples(samples_dir, reload=True)
+    samples = FeaturizedSamples(samples_dir, reload_data=True)

     print("Generating dataset.")
     full_data_dir = os.path.join(data_dir, "full-data")
@@ -227,10 +224,9 @@ def create_model(args):
   if args.train_test_split:
     print("+++++++++++++++++++++++++++++++++")
     print("Perform train-test split")
-    paths = [feature_dir]
     train_test_split(
-        paths, args.input_transforms, args.output_transforms, args.feature_types,
-        args.splittype, args.mode, data_dir)
+        args.input_transforms, args.output_transforms, args.feature_types,
+        args.splittype, data_dir)

   if args.fit:
     print("+++++++++++++++++++++++++++++++++")
@@ -281,6 +277,7 @@ def featurize_inputs(feature_dir, data_dir, input_files,
                      user_specified_features, tasks, smiles_field,
                      split_field, id_field, threshold, parallel):

+  """Allows for parallel data featurization."""
   featurize_input_partial = partial(featurize_input,
                                     feature_dir=feature_dir,
                                     user_specified_features=user_specified_features,
@@ -302,7 +299,7 @@ def featurize_inputs(feature_dir, data_dir, input_files,

   print("Writing samples to disk.")
   samples_dir = os.path.join(data_dir, "samples")
-  samples = FeaturizedSamples(samples_dir, dataset_files)
+  FeaturizedSamples(samples_dir, dataset_files)

 def featurize_input(input_file, feature_dir, user_specified_features, tasks,
                     smiles_field, split_field, id_field, threshold):
@@ -318,18 +315,18 @@ def featurize_input(input_file, feature_dir, user_specified_features, tasks,
       feature_dir, "%s.joblib" %(os.path.splitext(os.path.basename(input_file))[0]))
   featurizer.featurize(input_file, FeaturizedSamples.feature_types, out)

-def train_test_split(paths, input_transforms, output_transforms,
-                     feature_types, splittype, mode, data_dir):
+def train_test_split(input_transforms, output_transforms,
+                     feature_types, splittype, data_dir):
   """Saves transformed model."""

   samples_dir = os.path.join(data_dir, "samples")
-  samples = FeaturizedSamples(samples_dir, reload=True)
+  samples = FeaturizedSamples(samples_dir, reload_data=True)

   print("Split data into train/test")
   train_samples_dir = os.path.join(data_dir, "train-samples")
   test_samples_dir = os.path.join(data_dir, "test-samples")
-  train_samples, test_samples = samples.train_test_split(splittype,
-    train_samples_dir, test_samples_dir)
+  train_samples, test_samples = samples.train_test_split(
+      splittype, train_samples_dir, test_samples_dir)

   train_data_dir = os.path.join(data_dir, "train-data")
   test_data_dir = os.path.join(data_dir, "test-data")
@@ -366,7 +363,7 @@ def eval_trained_model(model_type, model_dir, data_dir,
   data = Dataset(data_dir)

   evaluator = Evaluator(model, data, verbose=True)
-  pred_y_df, perf_df = evaluator.compute_model_performance(csv_out, stats_out)
+  _, perf_df = evaluator.compute_model_performance(csv_out, stats_out)
   print("Model Performance.")
   print(perf_df)
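
The newly imported multiprocessing module pairs with the functools.partial pattern used in featurize_inputs: bind the shared keyword arguments once, then map the resulting callable over the input files with a worker pool. A minimal sketch with a hypothetical featurize_one worker (the real featurize_input takes many more fields):

  import multiprocessing as mp
  from functools import partial

  def featurize_one(input_file, feature_dir):
      # Hypothetical stand-in for featurize_input: handle one file.
      return "%s -> %s" % (input_file, feature_dir)

  if __name__ == "__main__":
      # Only input_file is left free, so the pool can map over files.
      worker = partial(featurize_one, feature_dir="/tmp/features")
      pool = mp.Pool(processes=2)
      print(pool.map(worker, ["a.csv", "b.csv"]))  # hypothetical inputs
      pool.close()
      pool.join()
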
+53 −39
@@ -7,8 +7,8 @@ from __future__ import unicode_literals
 import os
 import numpy as np
 import pandas as pd
+import multiprocessing as mp
 from functools import partial
-from rdkit import Chem
 from deepchem.utils.save import save_to_disk
 from deepchem.utils.save import load_from_disk
 from deepchem.utils.featurize import FeaturizedSamples
@@ -110,8 +110,7 @@ class Dataset(object):
     """
     Iterates over all shards in dataset.
     """
-    nb_shards = self.get_number_shards()
-    for i, row in self.metadata_df.iterrows():
+    for _, row in self.metadata_df.iterrows():
       X = load_from_disk(row['X-transformed'])
       y = load_from_disk(row['y-transformed'])
       w = load_from_disk(row['w'])
@@ -119,8 +118,13 @@ class Dataset(object):
       yield (X, y, w, ids)

   def transform(self, input_transforms, output_transforms, parallel=False):
-    (normalize_X, truncate_x, normalize_y,
-        truncate_y, log_X, log_y) = False, False, False, False, False, False
+    """
+    Transforms all internally stored data.
+
+    Adds X-transform, y-transform columns to metadata.
+    """
+    (normalize_X, truncate_x, normalize_y, truncate_y, log_X, log_y) = (
+        False, False, False, False, False, False)

     if "truncate" in input_transforms:
       truncate_x = True
@@ -141,10 +145,12 @@ class Dataset(object):
                                                        log_X, log_y,
                                                        parallel=parallel)
     nrow = self.metadata_df.shape[0]
-    self.metadata_df['X_means'] = [X_means for i in range(nrow)]
-    self.metadata_df['X_stds'] = [X_stds for i in range(nrow)]
-    self.metadata_df['y_means'] = [y_means for i in range(nrow)]
-    self.metadata_df['y_stds'] = [y_stds for i in range(nrow)]
+    # TODO(rbharath): These lines are puzzling. Better way to avoid storage
+    # duplication here?
+    self.metadata_df['X_means'] = [X_means for _ in range(nrow)]
+    self.metadata_df['X_stds'] = [X_stds for _ in range(nrow)]
+    self.metadata_df['y_means'] = [y_means for _ in range(nrow)]
+    self.metadata_df['y_stds'] = [y_stds for _ in range(nrow)]
     save_to_disk(
         self.metadata_df, self._get_metadata_filename())
     self.transforms = (input_transforms, output_transforms)
@@ -152,26 +158,30 @@ class Dataset(object):
         self.transforms, self._get_transforms_filename())

   def get_label_means(self):
+    """Return pandas series of label means."""
     return self.metadata_df["y_means"]

   def get_label_stds(self):
+    """Return pandas series of label stds."""
     return self.metadata_df["y_stds"]

   def get_input_transforms(self):
+    """Returns stored input transforms."""
     (input_transforms, _) = self.transforms
     return input_transforms

   def get_output_transforms(self):
+    """Returns stored output transforms."""
     (_, output_transforms) = self.transforms
     return output_transforms

   def _transform(self, normalize_X=True, normalize_y=True,
                  truncate_X=True, truncate_y=True,
                  log_X=False, log_y=False, parallel=False):
+    """Helper to (parallel) transform all indexed data."""
     df = self.metadata_df
     trunc = 5.0
     X_means, X_stds, y_means, y_stds = compute_mean_and_std(df)
-    total = df.shape[0]
     indices = range(0, df.shape[0])
     transform_row_partial = partial(_transform_row, df=df, normalize_X=normalize_X,
                                     normalize_y=normalize_y, truncate_X=truncate_X,
@@ -190,7 +200,11 @@ class Dataset(object):

 def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                    log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
-  total = df.shape[0]
+  """
+  Transforms the data (X, y, w,...) in a single row.
+
+  Writes X-transformed, y-transformed to disk.
+  """
   row = df.iloc[i]
   X = load_from_disk(row['X'])
   if normalize_X or log_X:
@@ -205,7 +219,6 @@ def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
   save_to_disk(X, row['X-transformed'])

   y = load_from_disk(row['y'])
-  w = load_from_disk(row['w'])
   if normalize_y or log_y:
     if normalize_y:
       y = np.nan_to_num((y - y_means) / y_stds)
@@ -249,6 +262,7 @@ def compute_sums_and_nb_sample(tensor, W=None):
 # make it easy to use multiprocessing.

 def write_dataset_single(val, data_dir, feature_types):
+  """Writes files for single row (X, y, w, X-transformed, ...) to disk."""
   (df_file, df) = val
   # TODO(rbharath): This is a hack. clean up.
   if not len(df):
@@ -290,7 +304,7 @@ def _df_to_numpy(df, feature_types):
   y = np.reshape(y, (n_samples, n_tasks))
   w = np.ones((n_samples, n_tasks))
   tensors = []
-  for i, datapoint in df.iterrows():
+  for _, datapoint in df.iterrows():
     feature_list = []
     for feature_type in feature_types:
       feature_list.append(datapoint[feature_type])
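
The y-normalization inside _transform_row follows the usual standardize-then-mask idiom: missing labels enter as NaN, and np.nan_to_num zeroes them out after standardization so the weight matrix w can mask them in the loss. A tiny worked sketch with hypothetical means and stds:

  import numpy as np

  y = np.array([[1.0, np.nan],
                [3.0, 4.0]])   # hypothetical labels; NaN = missing
  y_means = np.array([2.0, 4.0])
  y_stds = np.array([1.0, 2.0])

  # Standardize per task, then zero the entries that were NaN.
  y_norm = np.nan_to_num((y - y_means) / y_stds)
  print(y_norm)  # [[-1.  0.]
                 #  [ 1.  0.]]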