Unverified Commit b0ccc4a5 authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1322 from rbharath/genomic_metrics

Genomic metrics
parents c521c1c9 d872861b
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ Imports all submodules
from __future__ import division
from __future__ import unicode_literals

__version__ = '2.1.0'
__version__ = '2.1.1'

import deepchem.data
import deepchem.feat
+145 −0
Original line number Diff line number Diff line
"""Evaluation Metrics for Genomics Datasets."""

import numpy as np
from deepchem.data import NumpyDataset
from deepchem.utils.genomics import loaded_motifs
from scipy.signal import correlate2d


def get_motif_scores(encoded_sequences,
                     motif_names,
                     max_scores=None,
                     return_positions=False,
                     GC_fraction=0.4):
  """Computes pwm log odds.

  Parameters
  ----------
  encoded_sequences : 4darray
       (N_sequences, N_letters, sequence_length, 1) array
  motif_names : list of strings
  max_scores : int, optional
  return_positions : boolean, optional
  GC_fraction : float, optional

  Returns
  -------
  (N_sequences, num_motifs, seq_length) complete score array by default.
  If max_scores, (N_sequences, num_motifs*max_scores) max score array.
  If max_scores and return_positions, (N_sequences, 2*num_motifs*max_scores)
  array with max scores and their positions.
  """
  num_samples, _, seq_length, _ = encoded_sequences.shape
  scores = np.ones((num_samples, len(motif_names), seq_length))
  for j, motif_name in enumerate(motif_names):
    pwm = loaded_motifs.getPwm(motif_name).getRows().T
    log_pwm = np.log(pwm)
    gc_pwm = 0.5 * np.array(
        [[1 - GC_fraction, GC_fraction, GC_fraction, 1 - GC_fraction]] * len(
            pwm[0])).T
    gc_log_pwm = np.log(gc_pwm)
    log_scores = get_pssm_scores(encoded_sequences, log_pwm)
    gc_log_scores = get_pssm_scores(encoded_sequences, gc_log_pwm)
    scores[:, j, :] = log_scores - gc_log_scores
  if max_scores is not None:
    sorted_scores = np.sort(scores)[:, :, ::-1][:, :, :max_scores]
    if return_positions:
      sorted_positions = scores.argsort()[:, :, ::-1][:, :, :max_scores]
      return np.concatenate(
          (sorted_scores.reshape((num_samples, len(motif_names) * max_scores)),
           sorted_positions.reshape(
               (num_samples, len(motif_names) * max_scores))),
          axis=1)
    else:
      return sorted_scores.reshape((num_samples, len(motif_names) * max_scores))
  else:
    return scores


def get_pssm_scores(encoded_sequences, pssm):
  """
  Convolves pssm and its reverse complement with encoded sequences
  and returns the maximum score at each position of each sequence.

  Parameters
  ----------
  encoded_sequences: 3darray
       (N_sequences, N_letters, sequence_length, 1) array
  pssm: 2darray
      (4, pssm_length) array

  Returns
  -------
  scores: 2darray
      (N_sequences, sequence_length)
  """
  encoded_sequences = encoded_sequences.squeeze(axis=3)
  # initialize fwd and reverse scores to -infinity
  fwd_scores = np.full_like(encoded_sequences, -np.inf, float)
  rc_scores = np.full_like(encoded_sequences, -np.inf, float)
  # cross-correlate separately for each base,
  # for both the PSSM and its reverse complement
  for base_indx in range(encoded_sequences.shape[1]):
    base_pssm = pssm[base_indx][None]
    base_pssm_rc = base_pssm[:, ::-1]
    fwd_scores[:, base_indx, :] = correlate2d(
        encoded_sequences[:, base_indx, :], base_pssm, mode='same')
    rc_scores[:, base_indx, :] = correlate2d(
        encoded_sequences[:, -(base_indx + 1), :], base_pssm_rc, mode='same')
  # sum over the bases
  fwd_scores = fwd_scores.sum(axis=1)
  rc_scores = rc_scores.sum(axis=1)
  # take max of fwd and reverse scores at each position
  scores = np.maximum(fwd_scores, rc_scores)
  return scores


def in_silico_mutagenesis(model, X):
  """Computes in-silico-mutagenesis scores
   
  Parameters
  ----------
  model: TensorGraph
    Currently only SequenceDNN will work, but other models may be added.
  X: ndarray
    Shape (N_sequences, N_letters, sequence_length, 1) 

  Returns
  -------
  (num_task, N_sequences, N_letters, sequence_length, 1) ISM score array.
  """
  #Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
  mutagenesis_scores = np.empty(X.shape + (model.num_tasks,), dtype=np.float32)
  # Shape (N_sequences, num_tasks)
  wild_type_predictions = model.predict(NumpyDataset(X))
  # Shape (N_sequences, num_tasks, 1, 1, 1)
  wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                np.newaxis]
  for sequence_index, (sequence, wild_type_prediction) in enumerate(
      zip(X, wild_type_predictions)):

    # Mutates every position of the sequence to every letter
    # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
    # Breakdown:
    #  Shape of sequence[np.newaxis] (1, N_letters, sequence_length, 1)
    mutated_sequences = np.repeat(
        sequence[np.newaxis], np.prod(sequence.shape), axis=0)

    # remove wild-type
    # len(arange) = N_letters * sequence_length
    arange = np.arange(len(mutated_sequences))
    # len(horizontal cycle) = N_letters * sequence_length
    horizontal_cycle = np.tile(np.arange(sequence.shape[1]), sequence.shape[0])
    mutated_sequences[arange, :, horizontal_cycle, :] = 0

    # add mutant
    vertical_repeat = np.repeat(np.arange(sequence.shape[0]), sequence.shape[1])
    mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1
    # make mutant predictions
    mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
    mutated_predictions = mutated_predictions.reshape(sequence.shape +
                                                      (model.num_tasks,))
    mutagenesis_scores[
        sequence_index] = wild_type_prediction - mutated_predictions
  rolled_scores = np.rollaxis(mutagenesis_scores, -1)
  return rolled_scores
+92 −0
Original line number Diff line number Diff line
"""
Test that genomic metrics work.
"""
from __future__ import division
from __future__ import unicode_literals

import unittest
import os

import numpy as np
import deepchem as dc

LETTERS = "ACGT"

from deepchem.metrics.genomic_metrics import get_motif_scores
from deepchem.metrics.genomic_metrics import get_pssm_scores
from deepchem.metrics.genomic_metrics import in_silico_mutagenesis


class TestGenomicMetrics(unittest.TestCase):
  """
  Tests that genomic metrics work as expected. 
  """

  def test_get_motif_scores(self):
    """Check that motif_scores have correct shape."""
    # Encode motif
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))

    motif_scores = get_motif_scores(sequences, [motif_name])
    self.assertEqual(motif_scores.shape, (3, 1, 5))

  def test_get_pssm_scores(self):
    """Test get_pssm_scores returns correct shape."""
    motif_name = "TAL1_known4"
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    # sequences now has shape (3, 4, 5, 1)
    self.assertEqual(sequences.shape, (3, 4, 5, 1))
    pssm = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])

    pssm_scores = get_pssm_scores(sequences, pssm)
    self.assertEqual(pssm_scores.shape, (3, 5))

  def test_in_silico_mutagenesis_shape(self):
    """Test in-silico mutagenesis returns correct shape."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))

    #X = np.random.rand(10, 1, 4, 50)
    #y = np.random.randint(0, 2, size=(10, 1))
    #dataset = dc.data.NumpyDataset(X, y)
    dataset = dc.data.NumpyDataset(sequences, labels)
    model = dc.models.SequenceDNN(
        5, "binary_crossentropy", num_filters=[1, 1], kernel_size=[15, 15])
    model.fit(dataset, nb_epoch=1)

    # Call in-silico mutagenesis
    mutagenesis_scores = in_silico_mutagenesis(model, sequences)
    self.assertEqual(mutagenesis_scores.shape, (1, 3, 4, 5, 1))

  def test_in_silico_mutagenesis_nonzero(self):
    """Test in-silico mutagenesis returns nonzero output."""
    # Construct and train SequenceDNN model
    sequences = np.array(["ACGTA", "GATAG", "CGCGC"])
    sequences = dc.utils.save.seq_one_hot_encode(sequences, letters=LETTERS)
    labels = np.array([1, 0, 0])
    labels = np.reshape(labels, (3, 1))
    self.assertEqual(sequences.shape, (3, 4, 5, 1))

    #X = np.random.rand(10, 1, 4, 50)
    #y = np.random.randint(0, 2, size=(10, 1))
    #dataset = dc.data.NumpyDataset(X, y)
    dataset = dc.data.NumpyDataset(sequences, labels)
    model = dc.models.SequenceDNN(
        5, "binary_crossentropy", num_filters=[1, 1], kernel_size=[15, 15])
    model.fit(dataset, nb_epoch=1)

    # Call in-silico mutagenesis
    mutagenesis_scores = in_silico_mutagenesis(model, sequences)
    self.assertEqual(mutagenesis_scores.shape, (1, 3, 4, 5, 1))

    # Check nonzero elements exist
    assert np.count_nonzero(mutagenesis_scores) > 0
+8 −0
Original line number Diff line number Diff line
"""
Genomic data handling utilities.
"""
import simdna
from simdna.synthetic import LoadedEncodeMotifs

loaded_motifs = LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
+1 −1
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ author = DeepChem contributors
summary = Deep-learning models for drug discovery, quantum chemistry, and the life sciences.
home-page = https://github.com/deepchem/deepchem
license = MIT
version = 2.1.0
version = 2.1.1
classifier =
    Development Status :: 4 - Beta
    Environment :: Console