Commit 1f95554a authored by Ubuntu's avatar Ubuntu
Browse files

variety of optimizations and additions

parent 9569295e
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -79,6 +79,28 @@ def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
  return np.squeeze(np.array(features)), valid_inds

def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
  """Featurize the SMILES strings stored in a numpy array.

  Every entry of `arr` is parsed with `Chem.MolFromSmiles`. When parsing
  yields a molecule, its atoms are renumbered into canonical rank order
  before the molecule is passed (as a one-element list) to
  `featurizer.featurize`. Entries whose featurization result is empty are
  dropped.

  Args:
    arr: array of SMILES strings; must provide `.tolist()`.
    featurizer: object exposing `featurize(list_of_mols)` whose result has
      a `.size` attribute.
    log_every_N: a progress message is logged every `log_every_N` entries.
    verbose: forwarded to `log`.

  Returns:
    A 1-D array holding the non-empty featurization results, in input order.
  """
  per_mol = []
  for idx, smiles in enumerate(arr.tolist()):
    parsed = Chem.MolFromSmiles(smiles)
    if parsed:
      # Canonical atom ordering makes the featurization independent of how
      # the SMILES string happened to be written.
      canonical_ranks = rdmolfiles.CanonicalRankAtoms(parsed)
      parsed = rdmolops.RenumberAtoms(parsed, canonical_ranks)
    if idx % log_every_N == 0:
      log("Featurizing sample %d" % idx, verbose)
    per_mol.append(featurizer.featurize([parsed]))

  # Keep only the entries for which the featurizer produced something.
  kept = [feat for feat in per_mol if feat.size > 0]
  # squeeze() may collapse to 0-d for a single entry; reshape restores 1-D.
  return np.squeeze(np.array(kept)).reshape(-1,)

def get_user_specified_features(df, featurizer, verbose=True):
  """Extract and merge user specified features. 
+2 −3
Original line number Diff line number Diff line
@@ -246,7 +246,7 @@ class Dataset(object):
class NumpyDataset(Dataset):
  """A Dataset defined by in-memory numpy arrays."""

  def __init__(self, X, y=None, w=None, ids=None):
  def __init__(self, X, y=None, w=None, ids=None, n_tasks=1):
    n_samples = len(X)
    # The -1 indicates that y will be reshaped to have length -1
    if n_samples > 0:
@@ -256,9 +256,8 @@ class NumpyDataset(Dataset):
          w = np.reshape(w, (n_samples, -1))
      else:
        # Set labels to be zero, with zero weights
        y = np.zeros((n_samples, 1))
        y = np.zeros((n_samples, 1*n_tasks))
        w = np.zeros_like(y)
    n_tasks = y.shape[1]
    if ids is None:
      ids = np.arange(n_samples)
    if w is None:
+3 −0
Original line number Diff line number Diff line
@@ -71,6 +71,9 @@ def pearson_r2_score(y, y_pred):
  """Computes Pearson R^2 (square of Pearson correlation)."""
  return pearsonr(y, y_pred)[0]**2

def r2_score(y, y_pred):
  """Computes R^2 using coefficient of determination (can be negative).

  R^2 is `1 - SS_res / SS_tot`, where `SS_res` is the sum of squared
  residuals and `SS_tot` is the sum of squared deviations of `y` from its
  mean. The previous body called `r2_score` itself and therefore recursed
  without end; the score is now computed directly.

  Args:
    y: true values, shape `(n_samples,)` or `(n_samples, n_outputs)`.
    y_pred: predicted values with the same number of elements per axis
      as `y`.

  Returns:
    The score as a Python float. For 2-D input the per-column scores are
    averaged with equal weight.

  Raises:
    ValueError: if the inputs are empty or their shapes do not match.
  """
  import numpy as np  # local import so the function does not rely on file scope

  y_true = np.asarray(y, dtype=float)
  y_hat = np.asarray(y_pred, dtype=float)
  # Treat 1-D input as a single output column.
  if y_true.ndim == 1:
    y_true = y_true.reshape(-1, 1)
  if y_hat.ndim == 1:
    y_hat = y_hat.reshape(-1, 1)
  if y_true.shape != y_hat.shape:
    raise ValueError("y and y_pred have inconsistent shapes: %s vs %s" %
                     (y_true.shape, y_hat.shape))
  if y_true.shape[0] == 0:
    raise ValueError("r2_score requires at least one sample")

  ss_res = ((y_true - y_hat)**2).sum(axis=0)
  ss_tot = ((y_true - y_true.mean(axis=0))**2).sum(axis=0)

  # A constant target column has SS_tot == 0; score it 1.0 when predicted
  # exactly and 0.0 otherwise instead of dividing by zero.
  scores = np.ones_like(ss_res)
  varying = ss_tot != 0
  scores[varying] = 1.0 - ss_res[varying] / ss_tot[varying]
  scores[~varying & (ss_res != 0)] = 0.0
  return float(scores.mean())

def prc_auc_score(y, y_pred):
  """Compute area under precision-recall curve"""
+31 −1
Original line number Diff line number Diff line
@@ -2,6 +2,10 @@ import numpy as np
import six
import tensorflow as tf

from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.data.data_loader import featurize_smiles_np
from deepchem.data import NumpyDataset

from deepchem.feat.mol_graphs import ConvMol
from deepchem.metrics import to_one_hot, from_one_hot
from deepchem.models.tensorgraph.graph_layers import WeaveLayer, WeaveGather, \
@@ -608,7 +612,7 @@ class GraphConvTensorGraph(TensorGraph):
    batch_norm2 = BatchNorm(in_layers=[gc2])
    gp2 = GraphPool(in_layers=[batch_norm2, self.degree_slice, self.membership]
                    + self.deg_adjs)
    dense = Dense(out_channels=128, activation_fn=None, in_layers=[gp2])
    dense = Dense(out_channels=128, activation_fn=tf.nn.relu, in_layers=[gp2])
    batch_norm3 = BatchNorm(in_layers=[dense])
    gg1 = GraphGather(
        batch_size=self.batch_size,
@@ -712,3 +716,29 @@ class GraphConvTensorGraph(TensorGraph):
        metrics,
        labels=self.my_labels,
        weights=[self.my_task_weights])

  def predict_on_smiles(self, smiles, transformers):
    """Predict on a sequence of SMILES strings, one batch at a time.

    The input is cut into consecutive slices of at most `self.batch_size`
    entries; each slice is passed to `predict_on_smiles_batch` and the
    per-batch results are stacked along the sample axis.

    Args:
      smiles: sliceable sequence of SMILES strings supporting `len()`.
      transformers: forwarded unchanged to `predict_on_smiles_batch`.

    Returns:
      An array with one row per predicted sample. When `smiles` is empty,
      an array of shape `(0, len(self.outputs))` is returned.
    """
    n_smiles = len(smiles)
    if n_smiles == 0:
      # np.concatenate rejects an empty list, and there is nothing to predict.
      return np.zeros((0, len(self.outputs)))

    # Stepping by batch_size covers the final partial batch without ever
    # producing an empty slice, and avoids float division for the count.
    batch_preds = [
        self.predict_on_smiles_batch(smiles[start:start + self.batch_size],
                                     transformers)
        for start in range(0, n_smiles, self.batch_size)
    ]
    # Each batch result has samples along axis 0, so stack along that axis.
    return np.concatenate(batch_preds, axis=0)
  
  def predict_on_smiles_batch(self, smiles, transformers=[]):
    """Featurize one batch of SMILES strings and predict on it.

    The strings are featurized with a fresh `ConvMolFeaturizer` via
    `featurize_smiles_np`, wrapped in an unlabeled `NumpyDataset`, and
    fed through `default_generator` / `predict_on_generator`.

    Args:
      smiles: batch of SMILES strings, passed to `featurize_smiles_np`.
      transformers: forwarded to `predict_on_generator`.

    Returns:
      Predictions reshaped to `(-1, n_tasks)` and truncated to the number
      of featurized molecules, where `n_tasks` is `len(self.outputs)`.
    """
    # NOTE(review): the row count comes from the featurized array, so it may
    # differ from len(smiles) if featurization drops entries -- confirm.
    mol_features = featurize_smiles_np(smiles, ConvMolFeaturizer())
    n_featurized = mol_features.shape[0]
    task_count = len(self.outputs)

    unlabeled = NumpyDataset(X=mol_features, y=None, n_tasks=task_count)
    batches = self.default_generator(
        unlabeled, predict=True, pad_batches=False)
    preds = self.predict_on_generator(batches, transformers)

    return preds.reshape(-1, task_count)[:n_featurized]