Unverified Commit 9c879e45 authored by Daiki Nishikawa's avatar Daiki Nishikawa Committed by GitHub
Browse files

Merge pull request #2098 from nd-02110114/metrics

Add type annotation in metrics 
parents 6681561e 30422b81
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@ Docks Molecular Complexes
"""
import logging
import tempfile
from typing import cast, Generator, Optional, Tuple, Union
from typing import Generator, Optional, Tuple, Union
import numpy as np

from deepchem.utils.typing import RDKitMol
@@ -128,8 +128,8 @@ class Docker(object):
    # We know use_pose_generator_scores == False in this case
    if self.scoring_model is not None:
      for posed_complex in complexes:
        # NOTE: this casting is workaround. This line doesn't effect anything to the runtime
        self.featurizer = cast(ComplexFeaturizer, self.featurizer)
        # check whether self.featurizer is instance of ComplexFeaturizer or not
        assert isinstance(self.featurizer, ComplexFeaturizer)
        # TODO: How to handle the failure here?
        (protein_file, ligand_file) = molecular_complex
        features, _ = self.featurizer.featurize([protein_file], [ligand_file])
+39 −868

File changed.

Preview size limit exceeded, changes collapsed.

+59 −35
Original line number Diff line number Diff line
"""Evaluation Metrics for Genomics Datasets."""

from typing import List, Optional
import numpy as np
from deepchem.data import NumpyDataset
from scipy.signal import correlate2d

from deepchem.models import Model
from deepchem.data import NumpyDataset


def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
def get_motif_scores(encoded_sequences: np.ndarray,
                     motif_names: List[str],
                     max_scores: Optional[int] = None,
                     return_positions: bool = False,
                     GC_fraction: float = 0.4) -> np.ndarray:
  """Computes pwm log odds.

  Parameters
  ----------
  encoded_sequences : 4darray
       (N_sequences, N_letters, sequence_length, 1) array
  motif_names : list of strings
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`.
  motif_names: List[str]
    List of motif file names.
  max_scores: int, optional
  return_positions : boolean, optional
  GC_fraction : float, optional
    Get top `max_scores` scores.
  return_positions: bool, default False
    Whether to return positions or not.
  GC_fraction: float, default 0.4
    GC fraction in background sequence.

  Returns
  -------
  (N_sequences, num_motifs, seq_length) complete score array by default.
  If max_scores, (N_sequences, num_motifs*max_scores) max score array.
  If max_scores and return_positions, (N_sequences, 2*num_motifs*max_scores)
  array with max scores and their positions.
  np.ndarray
    A numpy array of complete score. The shape is `(N_sequences, num_motifs, seq_length)` by default.
    If max_scores, the shape of score array is `(N_sequences, num_motifs*max_scores)`.
    If max_scores and return_positions, the shape of score array with max scores and their positions
    is `(N_sequences, 2*num_motifs*max_scores)`.

  Notes
  -----
  This method requires simdna to be installed.
  """
  try:
    import simdna
    from simdna import synthetic
  except ModuleNotFoundError:
    raise ValueError("This function requires simdna to be installed.")

  loaded_motifs = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
  num_samples, _, seq_length, _ = encoded_sequences.shape
@@ -59,22 +75,23 @@ def get_motif_scores(encoded_sequences,
    return scores


def get_pssm_scores(encoded_sequences, pssm):
def get_pssm_scores(encoded_sequences: np.ndarray,
                    pssm: np.ndarray) -> np.ndarray:
  """
  Convolves pssm and its reverse complement with encoded sequences
  and returns the maximum score at each position of each sequence.

  Parameters
  ----------
  encoded_sequences: 3darray
       (N_sequences, N_letters, sequence_length, 1) array
  pssm: 2darray
      (4, pssm_length) array
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`.
  pssm: np.ndarray
    A numpy array of shape `(4, pssm_length)`.

  Returns
  -------
  scores: 2darray
      (N_sequences, sequence_length)
  scores: np.ndarray
    A numpy array of shape `(N_sequences, sequence_length)`.
  """
  encoded_sequences = encoded_sequences.squeeze(axis=3)
  # initialize fwd and reverse scores to -infinity
@@ -97,31 +114,36 @@ def get_pssm_scores(encoded_sequences, pssm):
  return scores


def in_silico_mutagenesis(model, X):
def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
  """Computes in-silico-mutagenesis scores

  Parameters
  ----------
  model: Model
    This can be any model that accepts inputs of the required shape and produces
    an output of shape (N_sequences, N_tasks).
  X: ndarray
    Shape (N_sequences, N_letters, sequence_length, 1)
    an output of shape `(N_sequences, N_tasks)`.
  encoded_sequences: np.ndarray
    A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

  Returns
  -------
  (num_task, N_sequences, N_letters, sequence_length, 1) ISM score array.
  np.ndarray
    A numpy array of ISM scores. The shape is `(num_task, N_sequences, N_letters, sequence_length, 1)`.
  """
  # Shape (N_sequences, num_tasks)
  wild_type_predictions = model.predict(NumpyDataset(X))
  wild_type_predictions = model.predict(NumpyDataset(encoded_sequences))
  # check whether wild_type_predictions is np.ndarray or not
  assert isinstance(wild_type_predictions, np.ndarray)
  num_tasks = wild_type_predictions.shape[1]
  # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
  mutagenesis_scores = np.empty(X.shape + (num_tasks,), dtype=np.float32)
  mutagenesis_scores = np.empty(
      encoded_sequences.shape + (num_tasks,), dtype=np.float32)
  # Shape (N_sequences, num_tasks, 1, 1, 1)
  wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                np.newaxis]
  for sequence_index, (sequence, wild_type_prediction) in enumerate(
      zip(X, wild_type_predictions)):
      zip(encoded_sequences, wild_type_predictions)):

    # Mutates every position of the sequence to every letter
    # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
@@ -142,6 +164,8 @@ def in_silico_mutagenesis(model, X):
    mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1
    # make mutant predictions
    mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
    # check whether mutated_predictions is np.ndarray or not
    assert isinstance(mutated_predictions, np.ndarray)
    mutated_predictions = mutated_predictions.reshape(sequence.shape +
                                                      (num_tasks,))
    mutagenesis_scores[
+740 −0

File added.

Preview size limit exceeded, changes collapsed.

+164 −0
Original line number Diff line number Diff line
"""Evaluation metrics."""

import numpy as np
from sklearn.metrics import matthews_corrcoef  # noqa
from sklearn.metrics import recall_score  # noqa
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import r2_score  # noqa
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score  # noqa
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score  # noqa
from sklearn.metrics import accuracy_score  # noqa
from sklearn.metrics import balanced_accuracy_score  # noqa
from scipy.stats import pearsonr

# `kappa_score` is a shorter alias bound to the very same callable as
# `sklearn.metrics.cohen_kappa_score`, so both names behave identically.
kappa_score = cohen_kappa_score


def pearson_r2_score(y: np.ndarray, y_pred: np.ndarray) -> float:
  """Return the squared Pearson correlation coefficient of two arrays.

  Parameters
  ----------
  y: np.ndarray
    Ground truth values.
  y_pred: np.ndarray
    Predicted values.

  Returns
  -------
  float
    The square of the Pearson correlation between `y` and `y_pred`.
  """
  # `pearsonr` yields the correlation first; the second entry is not needed.
  correlation = pearsonr(y, y_pred)[0]
  return correlation**2


def jaccard_index(y: np.ndarray, y_pred: np.ndarray) -> float:
  """Compute the Jaccard index (Intersection over Union).

  This metric is commonly used in image segmentation tasks. The function
  simply forwards its arguments to `jaccard_score`.

  DEPRECATED: WILL BE REMOVED IN A FUTURE VERSION OF DEEPCHEM. USE
  `jaccard_score` instead.

  Parameters
  ----------
  y: np.ndarray
    Ground truth array.
  y_pred: np.ndarray
    Predicted array.

  Returns
  -------
  score: float
    The Jaccard index. A number between 0 and 1.
  """
  score = jaccard_score(y, y_pred)
  return score


def pixel_error(y: np.ndarray, y_pred: np.ndarray) -> float:
  """Compute an error metric for the case where y and y_pred are images.

  The value returned is one minus the F1 score that `f1_score` computes
  for `y` against `y_pred`.

  Parameters
  ----------
  y: np.ndarray
    Ground truth array.
  y_pred: np.ndarray
    Predicted array.

  Returns
  -------
  score: float
    The pixel error. A number between 0 and 1.
  """
  f_measure = f1_score(y, y_pred)
  return 1 - f_measure


def prc_auc_score(y: np.ndarray, y_pred: np.ndarray) -> float:
  """Compute area under precision-recall curve

  Parameters
  ----------
  y: np.ndarray
    A numpy array of shape `(N, n_classes)` (one-hot labels) or `(N,)`
    (binary labels) with true labels.
  y_pred: np.ndarray
    Of shape `(N, n_classes)` with class probabilities. Column 1 is used as
    the score of the positive class.

  Returns
  -------
  float
    The area under the precision-recall curve. A number between 0 and 1.
  """
  y = np.asarray(y)
  y_pred = np.asarray(y_pred)
  # The documented `(N,)` form already holds the binary labels; only the
  # one-hot `(N, n_classes)` form needs the positive-class column extracted.
  # Indexing a 1-D array with `[:, 1]` would raise IndexError.
  positive_labels = y if y.ndim == 1 else y[:, 1]
  precision, recall, _ = precision_recall_curve(positive_labels, y_pred[:, 1])
  return auc(recall, precision)


def rms_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
  """Compute the root-mean-square error.

  Parameters
  ----------
  y_true: np.ndarray
    Ground truth values.
  y_pred: np.ndarray
    Predicted values.

  Returns
  -------
  float
    Square root of `mean_squared_error(y_true, y_pred)`.
  """
  mse = mean_squared_error(y_true, y_pred)
  return np.sqrt(mse)


def mae_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
  """Compute the mean absolute error.

  Parameters
  ----------
  y_true: np.ndarray
    Ground truth values.
  y_pred: np.ndarray
    Predicted values.

  Returns
  -------
  float
    The value of `mean_absolute_error(y_true, y_pred)`.
  """
  mae = mean_absolute_error(y_true, y_pred)
  return mae


def bedroc_score(y_true: np.ndarray, y_pred: np.ndarray, alpha: float = 20.0):
  """Compute the BEDROC metric.

  BEDROC, as described by Truchon and Bayley, modifies the ROC score by
  allowing for a factor of early recognition. See [1]_ for details.

  Parameters
  ----------
  y_true: np.ndarray
    Binary class labels. 1 for positive class, 0 otherwise.
  y_pred: np.ndarray
    Predictions; column 1 is taken as the score of the positive class.
  alpha: float, default 20.0
    Early recognition parameter.

  Returns
  -------
  float
    Value in [0, 1] that indicates the degree of early recognition.

  Notes
  -----
  This function requires RDKit to be installed.

  References
  ----------
  .. [1] Truchon et al. "Evaluating virtual screening methods: good and bad metrics
     for the "early recognition" problem." Journal of chemical information and modeling
     47.2 (2007): 488-508.
  """
  try:
    from rdkit.ML.Scoring.Scoring import CalcBEDROC
  except ModuleNotFoundError:
    raise ValueError("This function requires RDKit to be installed.")

  # Sanity checks: matching lengths, and exactly the two labels 0 and 1.
  assert len(y_true) == len(y_pred), 'Number of examples do not match'
  observed_labels = np.unique(y_true)
  assert np.array_equal(observed_labels.astype(int), [0, 1]), (
      'Class labels must be binary: %s' % observed_labels)

  labels = np.asarray(y_true).flatten()
  # Index 1 because predictions are one-hot style: column 1 is the positive class.
  positive_scores = np.asarray(y_pred)[:, 1].flatten()

  # Rank (label, score) pairs from highest to lowest score; CalcBEDROC is
  # told that the label lives in column 0 of each pair.
  ranked = sorted(
      zip(labels, positive_scores), key=lambda entry: entry[1], reverse=True)

  return CalcBEDROC(ranked, 0, alpha)
Loading