Commit 08dacbd7 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

added some sim tests

parent a45023c4
Loading
Loading
Loading
Loading
+1 −6
Original line number Diff line number Diff line
@@ -15,13 +15,8 @@ install:
- conda config --add channels http://conda.binstar.org/omnia
- bash scripts/install_deepchem_conda.sh deepchem
- source activate deepchem
- pip install yapf==0.20.0
- pip install yapf==0.19.0
- pip install coveralls
- cd $HOME
- git clone https://github.com/kundajelab/simdna.git
- cd simdna
- python setup.py develop
- cd $HOME/deepchem
- python setup.py install
script:
- nosetests --with-flaky -a '!slow' --with-timer --with-coverage --cover-package=deepchem -v deepchem --nologcapture
+3 −0
Original line number Diff line number Diff line
@@ -26,5 +26,8 @@ from deepchem.molnet.load_function.sider_datasets import load_sider
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast

from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting

from deepchem.molnet.run_benchmark import run_benchmark
from deepchem.molnet.run_benchmark_low_data import run_benchmark_low_data
+397 −0
Original line number Diff line number Diff line
from __future__ import absolute_import, division, print_function
from collections import OrderedDict
import numpy as np
import simdna
from simdna.synthetic import (
    RepeatedEmbedder,
    SubstringEmbedder,
    ReverseComplementWrapper,
    UniformPositionGenerator,
    InsideCentralBp,
    LoadedEncodeMotifs,
    PwmSamplerFromLoadedMotifs,
    UniformIntegerGenerator,
    ZeroOrderBackgroundGenerator,
    EmbedInABackground,
    GenerateSequenceNTimes,
    RandomSubsetOfEmbedders,
    IsInTraceLabelGenerator,
    EmbeddableEmbedder,
    PairEmbeddableGenerator,
)
from simdna.util import DiscreteDistribution

loaded_motifs = LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)


def get_distribution(GC_fraction):
  return DiscreteDistribution({
      'A': (1 - GC_fraction) / 2,
      'C': GC_fraction / 2,
      'G': GC_fraction / 2,
      'T': (1 - GC_fraction) / 2
  })


def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
  """
  Simulates sequences with a motif embedded anywhere in the sequence.

  Parameters
  ----------
  motif_name : str
      encode motif name
  seq_length : int
      length of sequence
  num_seqs: int
      number of sequences
  GC_fraction : float
      GC basepair fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
      Array with sequence strings.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  if motif_name is None:
    embedders = []
  else:
    substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    embedders = [
        SubstringEmbedder(ReverseComplementWrapper(substring_generator))
    ]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      embedders)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, embedding_arr


def motif_density(motif_name,
                  seq_length,
                  num_seqs,
                  min_counts,
                  max_counts,
                  GC_fraction,
                  central_bp=None):
  """
  Returns sequences with motif density, along with embeddings array.
  """
  substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
  if central_bp is not None:
    position_generator = InsideCentralBp(central_bp)
  else:
    position_generator = UniformPositionGenerator()
  quantity_generator = UniformIntegerGenerator(min_counts, max_counts)
  embedders = [
      RepeatedEmbedder(
          SubstringEmbedder(
              ReverseComplementWrapper(substring_generator),
              position_generator), quantity_generator)
  ]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      embedders)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, embedding_arr


def simulate_single_motif_detection(motif_name, seq_length, num_pos, num_neg,
                                    GC_fraction):
  """
    Simulates two classes of seqeuences:
        - Positive class sequence with a motif
          embedded anywhere in the sequence
        - Negative class sequence without the motif

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
  motif_sequence_arr, positive_embedding_arr = simple_motif_embedding(
      motif_name, seq_length, num_pos, GC_fraction)
  random_sequence_arr, negative_embedding_arr = simple_motif_embedding(
      None, seq_length, num_neg, GC_fraction)
  sequence_arr = np.concatenate((motif_sequence_arr, random_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_motif_counting(motif_name, seq_length, pos_counts, neg_counts,
                            num_pos, num_neg, GC_fraction):
  """
  Generates data for motif counting task.

  Parameters
  ----------
  motif_name : str
  seq_length : int
  pos_counts : list
      (min_counts, max_counts) for positive set.
  neg_counts : list
      (min_counts, max_counts) for negative set.
  num_pos : int
  num_neg : int
  GC_fraction : float

  Returns
  -------
  sequence_arr : 1darray
      Contains sequence strings.
  y : 1darray
      Contains labels.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  pos_count_sequence_array, positive_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, pos_counts[0], pos_counts[1],
      GC_fraction)
  neg_count_sequence_array, negative_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, neg_counts[0], neg_counts[1],
      GC_fraction)
  sequence_arr = np.concatenate((pos_count_sequence_array,
                                 neg_count_sequence_array))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_motif_density_localization(motif_name, seq_length, center_size,
                                        min_motif_counts, max_motif_counts,
                                        num_pos, num_neg, GC_fraction):
  """
  Simulates two classes of seqeuences:
      - Positive class sequences with multiple motif instances
        in center of the sequence.
      - Negative class sequences with multiple motif instances
        anywhere in the sequence.
  The number of motif instances is uniformly sampled
  between minimum and maximum motif counts.

  Parameters
  ----------
  motif_name : str
      encode motif name
  seq_length : int
      length of sequence
  center_size : int
      length of central part of the sequence where motifs can be positioned
  min_motif_counts : int
      minimum number of motif instances
  max_motif_counts : int
      maximum number of motif instances
  num_pos : int
      number of positive class sequences
  num_neg : int
      number of negative class sequences
  GC_fraction : float
      GC fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
      Contains sequence strings.
  y : 1darray
      Contains labels.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  localized_density_sequence_array, positive_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, min_motif_counts, max_motif_counts,
      GC_fraction, center_size)
  unlocalized_density_sequence_array, negative_embedding_arr = motif_density(
      motif_name, seq_length, num_neg, min_motif_counts, max_motif_counts,
      GC_fraction)
  sequence_arr = np.concatenate((localized_density_sequence_array,
                                 unlocalized_density_sequence_array))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
  """
    Generates data for multi motif recognition task.

    Parameters
    ----------
    motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_seqs : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : ndarray
        Contains labels for each motif.
    embedding_arr: 1darray
        Array of embedding objects.
    """

  def get_embedder(motif_name):
    substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    return SubstringEmbedder(
        ReverseComplementWrapper(substring_generator), name=motif_name)

  embedders = [get_embedder(motif_name) for motif_name in motif_names]
  quantity_generator = UniformIntegerGenerator(min_num_motifs, max_num_motifs)
  combined_embedder = [RandomSubsetOfEmbedders(quantity_generator, embedders)]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      combined_embedder)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  label_generator = IsInTraceLabelGenerator(np.asarray(motif_names))
  y = np.array(
      [
          label_generator.generateLabels(generated_seq)
          for generated_seq in generated_sequences
      ],
      dtype=bool)
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, y, embedding_arr


def simulate_differential_accessibility(
    pos_motif_names, neg_motif_names, seq_length, min_num_motifs,
    max_num_motifs, num_pos, num_neg, GC_fraction):
  """
    Generates data for differential accessibility task.

    Parameters
    ----------
    pos_motif_names : list
        List of strings.
    neg_motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_pos : int
    num_neg : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : 1darray
        Contains labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
  pos_motif_sequence_arr, _, positive_embedding_arr = simulate_multi_motif_embedding(
      pos_motif_names, seq_length, min_num_motifs, max_num_motifs, num_pos,
      GC_fraction)
  neg_motif_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
      neg_motif_names, seq_length, min_num_motifs, max_num_motifs, num_neg,
      GC_fraction)
  sequence_arr = np.concatenate((pos_motif_sequence_arr,
                                 neg_motif_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
  """
    Simulates two classes of sequences with motif1 and motif2:
        - Positive class sequences with motif1 and motif2 positioned
          min_spacing and max_spacing
        - Negative class sequences with independent motif1 and motif2 positioned
        anywhere in the sequence, not as a heterodimer grammar

    Parameters
    ----------
    seq_length : int, length of sequence
    GC_fraction : float, GC fraction in background sequence
    num_pos : int, number of positive class sequences
    num_neg : int, number of negatice class sequences
    motif1 : str, encode motif name
    motif2 : str, encode motif name
    min_spacing : int, minimum inter motif spacing
    max_spacing : int, maximum inter motif spacing

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: list
        List of embedding objects.
    """

  motif1_generator = ReverseComplementWrapper(
      PwmSamplerFromLoadedMotifs(loaded_motifs, motif1))
  motif2_generator = ReverseComplementWrapper(
      PwmSamplerFromLoadedMotifs(loaded_motifs, motif2))
  separation_generator = UniformIntegerGenerator(min_spacing, max_spacing)
  embedder = EmbeddableEmbedder(
      PairEmbeddableGenerator(motif1_generator, motif2_generator,
                              separation_generator))
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      [embedder])
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_pos).generateSequences())
  grammar_sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  positive_embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  nongrammar_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
      [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)
  sequence_arr = np.concatenate((grammar_sequence_arr, nongrammar_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr
+39 −0
Original line number Diff line number Diff line
import deepchem as dc
import numpy as np
import unittest


class TestDNASim(unittest.TestCase):

  def test_motif_density_localization_simulation(self):
    "Test motif density localization simulation." ""
    motif_density_localization_simulation_parameters = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "center_size": 150,
        "min_motif_counts": 2,
        "max_motif_counts": 4,
        "num_pos": 30,
        "num_neg": 30,
        "GC_fraction": 0.4
    }
    sequences, y, embed = dc.molnet.simulate_motif_density_localization(
        **motif_density_localization_simulation_parameters)
    assert sequences.shape == (60,)
    assert y.shape == (60, 1)

  def test_motif_counting_simulation(self):
    "Test motif counting"
    motif_count_simulation_parameters = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "pos_counts": [5, 10],
        "neg_counts": [1, 2],
        "num_pos": 30,
        "num_neg": 30,
        "GC_fraction": 0.4
    }
    sequences, y, embed = dc.molnet.simulate_motif_counting(
        **motif_count_simulation_parameters)
    assert sequences.shape == (60,)
    assert y.shape == (60, 1)
+7 −4
Original line number Diff line number Diff line
from setuptools import setup

setup(
    setup_requires=['pbr'],
    pbr=True,
)
config = {
  'install_requires': ['simdna==0.3'],
  'dependency_links': ["https://github.com/kundajelab/simdna/tarball/0.3#egg=simdna-0.3"],
  'setup_requires'=['pbr'],
  'pbr': True
}
setup(**config)