Commit 21d20383 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

Local changes

parent 77907938
Loading
Loading
Loading
Loading
+148 −1170

File changed.

Preview size limit exceeded, changes collapsed.

+367 −0
Original line number Diff line number Diff line
from __future__ import absolute_import, division, print_function
from collections import OrderedDict
import numpy as np
import simdna
from simdna.synthetic import (
    RepeatedEmbedder, SubstringEmbedder, ReverseComplementWrapper,
    UniformPositionGenerator, InsideCentralBp,
    LoadedEncodeMotifs, PwmSamplerFromLoadedMotifs,
    UniformIntegerGenerator, ZeroOrderBackgroundGenerator,
    EmbedInABackground, GenerateSequenceNTimes,
    RandomSubsetOfEmbedders, IsInTraceLabelGenerator,
    EmbeddableEmbedder, PairEmbeddableGenerator,
)
from simdna.util import DiscreteDistribution

loaded_motifs = LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                   pseudocountProb=0.001)


def get_distribution(GC_fraction):
    return DiscreteDistribution({
         'A': (1 - GC_fraction) / 2, 'C': GC_fraction / 2,
         'G': GC_fraction / 2, 'T': (1 - GC_fraction) / 2})


def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
    """
    Simulates sequences with a motif embedded anywhere in the sequence.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_seqs: int
        number of sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    embedding_arr: 1darray
        Array of embedding objects.
    """
    if motif_name is None:
        embedders = []
    else:
        substring_generator = PwmSamplerFromLoadedMotifs(
            loaded_motifs, motif_name)
        embedders = [SubstringEmbedder(
            ReverseComplementWrapper(substring_generator))]
    embed_in_background = EmbedInABackground(
        ZeroOrderBackgroundGenerator(
            seq_length, discreteDistribution=get_distribution(GC_fraction)),
        embedders)
    generated_sequences = tuple(GenerateSequenceNTimes(
        embed_in_background, num_seqs).generateSequences())
    sequence_arr = np.array([generated_seq.seq for generated_seq in generated_sequences])
    embedding_arr = [generated_seq.embeddings for generated_seq in generated_sequences]
    return sequence_arr, embedding_arr


def motif_density(motif_name, seq_length, num_seqs,
                  min_counts, max_counts, GC_fraction,
                  central_bp=None):
    """
    returns sequences with motif density, along with embeddings array.
    """
    substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    if central_bp is not None:
        position_generator = InsideCentralBp(central_bp)
    else:
        position_generator = UniformPositionGenerator()
    quantity_generator = UniformIntegerGenerator(min_counts, max_counts)
    embedders = [
        RepeatedEmbedder(
            SubstringEmbedder(
                ReverseComplementWrapper(
                    substring_generator), position_generator),
            quantity_generator)]
    embed_in_background = EmbedInABackground(
        ZeroOrderBackgroundGenerator(
            seq_length, discreteDistribution=get_distribution(GC_fraction)),
        embedders)
    generated_sequences = tuple(GenerateSequenceNTimes(
        embed_in_background, num_seqs).generateSequences())
    sequence_arr = np.array([generated_seq.seq for generated_seq in generated_sequences])
    embedding_arr = [generated_seq.embeddings for generated_seq in generated_sequences]
    return sequence_arr, embedding_arr


def simulate_single_motif_detection(motif_name, seq_length,
                                    num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of seqeuences:
        - Positive class sequence with a motif
          embedded anywhere in the sequence
        - Negative class sequence without the motif

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
    motif_sequence_arr, positive_embedding_arr = simple_motif_embedding(
        motif_name, seq_length, num_pos, GC_fraction)
    random_sequence_arr, negative_embedding_arr = simple_motif_embedding(
        None, seq_length, num_neg, GC_fraction)
    sequence_arr = np.concatenate((motif_sequence_arr, random_sequence_arr))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr


def simulate_motif_counting(motif_name, seq_length, pos_counts, neg_counts,
                            num_pos, num_neg, GC_fraction):
    """
    Generates data for motif counting task.

    Parameters
    ----------
    motif_name : str
    seq_length : int
    pos_counts : list
        (min_counts, max_counts) for positive set.
    neg_counts : list
        (min_counts, max_counts) for negative set.
    num_pos : int
    num_neg : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : 1darray
        Contains labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
    pos_count_sequence_array, positive_embedding_arr = motif_density(
        motif_name, seq_length, num_pos,
        pos_counts[0], pos_counts[1], GC_fraction)
    neg_count_sequence_array, negative_embedding_arr = motif_density(
        motif_name, seq_length, num_pos,
        neg_counts[0], neg_counts[1], GC_fraction)
    sequence_arr = np.concatenate(
        (pos_count_sequence_array, neg_count_sequence_array))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr


def simulate_motif_density_localization(
        motif_name, seq_length, center_size, min_motif_counts,
        max_motif_counts, num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of seqeuences:
        - Positive class sequences with multiple motif instances
          in center of the sequence.
        - Negative class sequences with multiple motif instances
          anywhere in the sequence.
    The number of motif instances is uniformly sampled
    between minimum and maximum motif counts.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    center_size : int
        length of central part of the sequence where motifs can be positioned
    min_motif_counts : int
        minimum number of motif instances
    max_motif_counts : int
        maximum number of motif instances
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : 1darray
        Contains labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
    localized_density_sequence_array, positive_embedding_arr = motif_density(
        motif_name, seq_length, num_pos,
        min_motif_counts, max_motif_counts, GC_fraction, center_size)
    unlocalized_density_sequence_array, negative_embedding_arr = motif_density(
        motif_name, seq_length, num_neg,
        min_motif_counts, max_motif_counts, GC_fraction)
    sequence_arr = np.concatenate(
        (localized_density_sequence_array, unlocalized_density_sequence_array))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr


def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
    """
    Generates data for multi motif recognition task.

    Parameters
    ----------
    motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_seqs : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : ndarray
        Contains labels for each motif.
    embedding_arr: 1darray
        Array of embedding objects.
    """

    def get_embedder(motif_name):
        substring_generator = PwmSamplerFromLoadedMotifs(
            loaded_motifs, motif_name)
        return SubstringEmbedder(
                ReverseComplementWrapper(substring_generator),
                name=motif_name)

    embedders = [get_embedder(motif_name) for motif_name in motif_names]
    quantity_generator = UniformIntegerGenerator(
        min_num_motifs, max_num_motifs)
    combined_embedder = [RandomSubsetOfEmbedders(
        quantity_generator, embedders)]
    embed_in_background = EmbedInABackground(
        ZeroOrderBackgroundGenerator(
            seq_length, discreteDistribution=get_distribution(GC_fraction)),
        combined_embedder)
    generated_sequences = tuple(GenerateSequenceNTimes(
        embed_in_background, num_seqs).generateSequences())
    sequence_arr = np.array([generated_seq.seq for generated_seq in generated_sequences])
    label_generator = IsInTraceLabelGenerator(np.asarray(motif_names))
    y = np.array([label_generator.generateLabels(generated_seq)
                  for generated_seq in generated_sequences], dtype=bool)
    embedding_arr = [generated_seq.embeddings for generated_seq in generated_sequences]
    return sequence_arr, y, embedding_arr


def simulate_differential_accessibility(
        pos_motif_names, neg_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_pos, num_neg, GC_fraction):
    """
    Generates data for differential accessibility task.

    Parameters
    ----------
    pos_motif_names : list
        List of strings.
    neg_motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_pos : int
    num_neg : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : 1darray
        Contains labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
    pos_motif_sequence_arr, _, positive_embedding_arr = simulate_multi_motif_embedding(
        pos_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_pos, GC_fraction)
    neg_motif_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
        neg_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_neg, GC_fraction)
    sequence_arr = np.concatenate(
        (pos_motif_sequence_arr, neg_motif_sequence_arr))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr


def simulate_heterodimer_grammar(
        motif1, motif2, seq_length,
        min_spacing, max_spacing, num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of sequences with motif1 and motif2:
        - Positive class sequences with motif1 and motif2 positioned
          min_spacing and max_spacing
        - Negative class sequences with independent motif1 and motif2 positioned
        anywhere in the sequence, not as a heterodimer grammar

    Parameters
    ----------
    seq_length : int, length of sequence
    GC_fraction : float, GC fraction in background sequence
    num_pos : int, number of positive class sequences
    num_neg : int, number of negatice class sequences
    motif1 : str, encode motif name
    motif2 : str, encode motif name
    min_spacing : int, minimum inter motif spacing
    max_spacing : int, maximum inter motif spacing

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: list
        List of embedding objects.
    """

    motif1_generator = ReverseComplementWrapper(PwmSamplerFromLoadedMotifs(loaded_motifs, motif1))
    motif2_generator = ReverseComplementWrapper(PwmSamplerFromLoadedMotifs(loaded_motifs, motif2))
    separation_generator = UniformIntegerGenerator(min_spacing, max_spacing)
    embedder = EmbeddableEmbedder(PairEmbeddableGenerator(
        motif1_generator, motif2_generator, separation_generator))
    embed_in_background = EmbedInABackground(ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction)), [embedder])
    generated_sequences = tuple(GenerateSequenceNTimes(
        embed_in_background, num_pos).generateSequences())
    grammar_sequence_arr = np.array([generated_seq.seq for generated_seq in generated_sequences])
    positive_embedding_arr = [generated_seq.embeddings for generated_seq in generated_sequences]
    nongrammar_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
        [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)
    sequence_arr = np.concatenate(
        (grammar_sequence_arr, nongrammar_sequence_arr))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr
+81 −67
Original line number Diff line number Diff line
@@ -6,7 +6,8 @@ from collections import namedtuple, defaultdict, OrderedDict
import numpy as np
np.random.seed(1)
from sklearn.model_selection import train_test_split
from simdna import simulations
#from simdna import simulations
import simulations
from simdna.synthetic import StringEmbeddable
from utils import get_motif_scores, one_hot_encode
from models import SequenceDNN
@@ -14,72 +15,85 @@ from dragonn.plot import add_letters_to_axis, plot_motif
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

Data = namedtuple(
    'Data',
    ('X_train', 'X_valid', 'X_test', 'train_embeddings', 'valid_embeddings',
     'test_embeddings', 'y_train', 'y_valid', 'y_test', 'motif_names'))


def get_available_simulations():
  return [
      function_name for function_name in dir(simulations)
      if "simulate" in function_name
  ]


def print_available_simulations():
  for function_name in get_available_simulations():
    print(function_name)


def get_simulation_function(simulation_name):
  if simulation_name in get_available_simulations():
    return getattr(simulations, simulation_name)
  else:
    print("%s is not available. Available simulations are:" % (simulation_name))
    print_available_simulations()


def print_simulation_info(simulation_name):
  simulation_function = get_simulation_function(simulation_name)
  if simulation_function is not None:
    print(simulation_function.__doc__)


def get_simulation_data(simulation_name,
                        simulation_parameters,
                        test_set_size=4000,
                        validation_set_size=3200):
  simulation_function = get_simulation_function(simulation_name)
  sequences, y, embeddings = simulation_function(**simulation_parameters)
  if simulation_name == "simulate_heterodimer_grammar":
    motif_names = [
        simulation_parameters["motif1"], simulation_parameters["motif2"]
    ]
  elif simulation_name == "simulate_multi_motif_embedding":
    motif_names = simulation_parameters["motif_names"]
  else:
    motif_names = [simulation_parameters["motif_name"]]

  train_sequences, test_sequences, train_embeddings, test_embeddings, y_train, y_test = \
      train_test_split(sequences, embeddings, y, test_size=test_set_size)
  train_sequences, valid_sequences, train_embeddings, valid_embeddings, y_train, y_valid = \
      train_test_split(train_sequences, train_embeddings, y_train, test_size=validation_set_size)
  X_train = one_hot_encode(train_sequences)
  X_valid = one_hot_encode(valid_sequences)
  X_test = one_hot_encode(test_sequences)

  return Data(X_train, X_valid, X_test, train_embeddings, valid_embeddings,
              test_embeddings, y_train, y_valid, y_test, motif_names)


def inspect_SequenceDNN():
  print(inspect.getdoc(SequenceDNN))
  print("\nAvailable methods:\n")
  for (method_name, _) in inspect.getmembers(
      SequenceDNN, predicate=inspect.ismethod):
    if method_name != "__init__":
      print(method_name)
#Data = namedtuple(
#    'Data',
#    ('X_train', 'X_valid', 'X_test', 'train_embeddings', 'valid_embeddings',
#     'test_embeddings', 'y_train', 'y_valid', 'y_test', 'motif_names'))


#def get_available_simulations():
#  return [
#      function_name for function_name in dir(simulations)
#      if "simulate" in function_name
#  ]


#def print_available_simulations():
#  for function_name in get_available_simulations():
#    print(function_name)


#def get_simulation_function(simulation_name):
#  if simulation_name in get_available_simulations():
#    return getattr(simulations, simulation_name)
#  else:
#    print("%s is not available. Available simulations are:" % (simulation_name))
#    print_available_simulations()


#def print_simulation_info(simulation_name):
#  simulation_function = get_simulation_function(simulation_name)
#  if simulation_function is not None:
#    print(simulation_function.__doc__)


#def get_simulation_data(simulation_name,
#                        simulation_parameters,
#                        test_set_size=4000,
#                        validation_set_size=3200):
#  simulation_function = get_simulation_function(simulation_name)
#  sequences, y, embeddings = simulation_function(**simulation_parameters)
#  if simulation_name == "simulate_heterodimer_grammar":
#    motif_names = [
#        simulation_parameters["motif1"], simulation_parameters["motif2"]
#    ]
#  elif simulation_name == "simulate_multi_motif_embedding":
#    motif_names = simulation_parameters["motif_names"]
#  else:
#    motif_names = [simulation_parameters["motif_name"]]
#
#  train_sequences, test_sequences, train_embeddings, test_embeddings, y_train, y_test = \
#      train_test_split(sequences, embeddings, y, test_size=test_set_size)
#  train_sequences, valid_sequences, train_embeddings, valid_embeddings, y_train, y_valid = \
#      train_test_split(train_sequences, train_embeddings, y_train, test_size=validation_set_size)
#  X_train = one_hot_encode(train_sequences)
#  X_valid = one_hot_encode(valid_sequences)
#  X_test = one_hot_encode(test_sequences)
#
#  print("X_train.shape")
#  print(X_train.shape)
#  print("X_valid.shape")
#  print(X_valid.shape)
#  print("X_test.shape")
#  print(X_test.shape)
#  print("y_train.shape")
#  print(y_train.shape)
#  print("y_valid.shape")
#  print(y_valid.shape)
#  print("y_test.shape")
#  print(y_test.shape)
#
#  return Data(X_train, X_valid, X_test, train_embeddings, valid_embeddings,
#              test_embeddings, y_train, y_valid, y_test, motif_names)


#def inspect_SequenceDNN():
#  print(inspect.getdoc(SequenceDNN))
#  print("\nAvailable methods:\n")
#  for (method_name, _) in inspect.getmembers(
#      SequenceDNN, predicate=inspect.ismethod):
#    if method_name != "__init__":
#      print(method_name)


def get_SequenceDNN(SequenceDNN_parameters):
+20 −73
Original line number Diff line number Diff line
@@ -4,8 +4,11 @@ Code adapated from github.com/kundajelab/dragonn
repository. The SequenceDNN class is useful for prediction
tasks working with genomic data.
"""
import tensorflow as tf
from deepchem.nn.regularizers import l1
from deepchem.models import Sequential
from deepchem.models.tensorgraph import layers
from deepchem.data import NumpyDataset

class SequenceDNN(Sequential):
  """
@@ -35,84 +38,28 @@ class SequenceDNN(Sequential):
               seq_length,
               use_RNN=False,
               num_tasks=1,
               num_filters=(15, 15, 15),
               conv_width=(15, 15, 15),
               num_filters=15,
               kernel_size=15,
               pool_width=35,
               GRU_size=35,
               TDD_size=15,
               L1=0,
               dropout=0.0,
               verbose=True):
               verbose=True,
               **kwargs):
    super(SequenceDNN, self).__init__(**kwargs)
    self.num_tasks = num_tasks
    self.verbose = verbose
    assert len(num_filters) == len(conv_width)
    for i, (nb_filter, nb_col) in enumerate(zip(num_filters, conv_width)):
      conv_height = 4 if i == 0 else 1
      self.add(
          layers.Conv2D(
            nb_filter=nb_filter,
            nb_row=conv_height,
            nb_col=nb_col,
            activation='linear',
            init='he_normal',
            input_shape=(1, 4, seq_length),
            W_regularizer=l1(L1),
            b_regularizer=l1(L1)))
      self.add(Activation('relu'))
      self.add(Dropout(dropout))
      self.add(MaxPooling2D(pool_size=(1, pool_width)))
      if use_RNN:
        num_max_pool_outputs = self.model.layers[-1].output_shape[-1]
        self.add(Reshape((num_filters[-1], num_max_pool_outputs)))
        self.add(Permute((2, 1)))
        self.add(GRU(GRU_size, return_sequences=True))
        self.add(TimeDistributedDense(TDD_size, activation='relu'))
      self.add(Flatten())
      self.add(Dense(output_dim=self.num_tasks))
      self.add(Activation('sigmoid'))
      self.compile(optimizer='adam', loss='binary_crossentropy')
    else:
      raise ValueError(
          "Exactly one of seq_length or keras_model must be specified!")

  def train(self,
            X,
            y,
            validation_data,
            early_stopping_metric='Loss',
            early_stopping_patience=5,
            save_best_model_to_prefix=None):
    if y.dtype != bool:
      assert set(np.unique(y)) == {0, 1}
      y = y.astype(bool)
    multitask = y.shape[1] > 1
    if not multitask:
      num_positives = y.sum()
      num_sequences = len(y)
      num_negatives = num_sequences - num_positives
    if self.verbose:
      print('Training model (* indicates new best result)...')
    X_valid, y_valid = validation_data
    early_stopping_wait = 0
    best_metric = np.inf if early_stopping_metric == 'Loss' else -np.inf
    for epoch in range(1, self.num_epochs + 1):
      self.model.fit(
          X,
          y,
          batch_size=128,
          nb_epoch=1,
          class_weight={
              True: num_sequences / num_positives,
              False: num_sequences / num_negatives
          } if not multitask else None,
          verbose=self.verbose)
      if self.verbose:
        print('Epoch {}:'.format(epoch))
      if self.verbose:
        print()
    if self.verbose:
      print('Finished training after {} epochs.'.format(epoch))
      if save_best_model_to_prefix is not None:
        print("The best model's architecture and weights (from epoch {0}) "
              'were saved to {1}.arch.json and {1}.weights.h5'.format(
                  best_epoch, save_best_model_to_prefix))
    self.add(layers.Conv2D(num_filters, kernel_size=kernel_size))
    self.add(layers.Dropout(dropout))
    self.add(layers.MaxPool2D())
    #if use_RNN:
    #  num_max_pool_outputs = self.model.layers[-1].output_shape[-1]
    #  self.add(Reshape((num_filters[-1], num_max_pool_outputs)))
    #  self.add(Permute((2, 1)))
    #  self.add(GRU(GRU_size, return_sequences=True))
    #  self.add(TimeDistributedDense(TDD_size, activation='relu'))
    self.add(layers.Flatten())
    self.add(layers.Dense(self.num_tasks, activation_fn=tf.nn.relu))
    #self.add(Activation('sigmoid'))
+13 −1
Original line number Diff line number Diff line
@@ -4,7 +4,19 @@ import unittest

class TestSequenceDNN(unittest.TestCase):

  def test_sequence_dnn_init(self):
  def test_seq_dnn_init(self):
    """Test SequenceDNN can be initialized."""
    model = dc.models.SequenceDNN(10)
    
  def test_seq_dnn_train(self):
    """Test SequenceDNN training works."""
    X = np.random.rand(10, 1, 4, 50)
    y = np.random.randint(0, 2, size=(10, 1))
    #  # TODO(rbharath): Transform these into useful weights.
    #  #class_weight={
    #  #    True: num_sequences / num_positives,
    #  #    False: num_sequences / num_negatives
    #  #} if not multitask else None,
    dataset = dc.data.NumpyDataset(X, y)
    model = dc.models.SequenceDNN(50)
    model.fit(dataset, "binary_crossentropy", nb_epoch=1)
Loading