Unverified Commit 2e3a120a authored by Bharath Ramsundar's avatar Bharath Ramsundar Committed by GitHub
Browse files

Merge pull request #1008 from rbharath/simulations

Adding DNA simulations
parents e20c1451 6f7d4f4d
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -26,5 +26,11 @@ from deepchem.molnet.load_function.sider_datasets import load_sider
from deepchem.molnet.load_function.tox21_datasets import load_tox21
from deepchem.molnet.load_function.toxcast_datasets import load_toxcast

from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting
from deepchem.molnet.dnasim import simple_motif_embedding
from deepchem.molnet.dnasim import motif_density
from deepchem.molnet.dnasim import simulate_single_motif_detection

from deepchem.molnet.run_benchmark import run_benchmark
from deepchem.molnet.run_benchmark_low_data import run_benchmark_low_data
+397 −0
Original line number Diff line number Diff line
from __future__ import absolute_import, division, print_function
from collections import OrderedDict
import numpy as np
import simdna
from simdna.synthetic import (
    RepeatedEmbedder,
    SubstringEmbedder,
    ReverseComplementWrapper,
    UniformPositionGenerator,
    InsideCentralBp,
    LoadedEncodeMotifs,
    PwmSamplerFromLoadedMotifs,
    UniformIntegerGenerator,
    ZeroOrderBackgroundGenerator,
    EmbedInABackground,
    GenerateSequenceNTimes,
    RandomSubsetOfEmbedders,
    IsInTraceLabelGenerator,
    EmbeddableEmbedder,
    PairEmbeddableGenerator,
)
from simdna.util import DiscreteDistribution

loaded_motifs = LoadedEncodeMotifs(
    simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)


def get_distribution(GC_fraction):
  return DiscreteDistribution({
      'A': (1 - GC_fraction) / 2,
      'C': GC_fraction / 2,
      'G': GC_fraction / 2,
      'T': (1 - GC_fraction) / 2
  })


def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
  """
  Simulates sequences with a motif embedded anywhere in the sequence.

  Parameters
  ----------
  motif_name : str
      encode motif name
  seq_length : int
      length of sequence
  num_seqs: int
      number of sequences
  GC_fraction : float
      GC basepair fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
      Array with sequence strings.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  if motif_name is None:
    embedders = []
  else:
    substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    embedders = [
        SubstringEmbedder(ReverseComplementWrapper(substring_generator))
    ]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      embedders)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, embedding_arr


def motif_density(motif_name,
                  seq_length,
                  num_seqs,
                  min_counts,
                  max_counts,
                  GC_fraction,
                  central_bp=None):
  """
  Returns sequences with motif density, along with embeddings array.
  """
  substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
  if central_bp is not None:
    position_generator = InsideCentralBp(central_bp)
  else:
    position_generator = UniformPositionGenerator()
  quantity_generator = UniformIntegerGenerator(min_counts, max_counts)
  embedders = [
      RepeatedEmbedder(
          SubstringEmbedder(
              ReverseComplementWrapper(substring_generator),
              position_generator), quantity_generator)
  ]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      embedders)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, embedding_arr


def simulate_single_motif_detection(motif_name, seq_length, num_pos, num_neg,
                                    GC_fraction):
  """
    Simulates two classes of seqeuences:
        - Positive class sequence with a motif
          embedded anywhere in the sequence
        - Negative class sequence without the motif

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
  motif_sequence_arr, positive_embedding_arr = simple_motif_embedding(
      motif_name, seq_length, num_pos, GC_fraction)
  random_sequence_arr, negative_embedding_arr = simple_motif_embedding(
      None, seq_length, num_neg, GC_fraction)
  sequence_arr = np.concatenate((motif_sequence_arr, random_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_motif_counting(motif_name, seq_length, pos_counts, neg_counts,
                            num_pos, num_neg, GC_fraction):
  """
  Generates data for motif counting task.

  Parameters
  ----------
  motif_name : str
  seq_length : int
  pos_counts : list
      (min_counts, max_counts) for positive set.
  neg_counts : list
      (min_counts, max_counts) for negative set.
  num_pos : int
  num_neg : int
  GC_fraction : float

  Returns
  -------
  sequence_arr : 1darray
      Contains sequence strings.
  y : 1darray
      Contains labels.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  pos_count_sequence_array, positive_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, pos_counts[0], pos_counts[1],
      GC_fraction)
  neg_count_sequence_array, negative_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, neg_counts[0], neg_counts[1],
      GC_fraction)
  sequence_arr = np.concatenate((pos_count_sequence_array,
                                 neg_count_sequence_array))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_motif_density_localization(motif_name, seq_length, center_size,
                                        min_motif_counts, max_motif_counts,
                                        num_pos, num_neg, GC_fraction):
  """
  Simulates two classes of seqeuences:
      - Positive class sequences with multiple motif instances
        in center of the sequence.
      - Negative class sequences with multiple motif instances
        anywhere in the sequence.
  The number of motif instances is uniformly sampled
  between minimum and maximum motif counts.

  Parameters
  ----------
  motif_name : str
      encode motif name
  seq_length : int
      length of sequence
  center_size : int
      length of central part of the sequence where motifs can be positioned
  min_motif_counts : int
      minimum number of motif instances
  max_motif_counts : int
      maximum number of motif instances
  num_pos : int
      number of positive class sequences
  num_neg : int
      number of negative class sequences
  GC_fraction : float
      GC fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
      Contains sequence strings.
  y : 1darray
      Contains labels.
  embedding_arr: 1darray
      Array of embedding objects.
  """
  localized_density_sequence_array, positive_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, min_motif_counts, max_motif_counts,
      GC_fraction, center_size)
  unlocalized_density_sequence_array, negative_embedding_arr = motif_density(
      motif_name, seq_length, num_neg, min_motif_counts, max_motif_counts,
      GC_fraction)
  sequence_arr = np.concatenate((localized_density_sequence_array,
                                 unlocalized_density_sequence_array))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
  """
    Generates data for multi motif recognition task.

    Parameters
    ----------
    motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_seqs : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : ndarray
        Contains labels for each motif.
    embedding_arr: 1darray
        Array of embedding objects.
    """

  def get_embedder(motif_name):
    substring_generator = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    return SubstringEmbedder(
        ReverseComplementWrapper(substring_generator), name=motif_name)

  embedders = [get_embedder(motif_name) for motif_name in motif_names]
  quantity_generator = UniformIntegerGenerator(min_num_motifs, max_num_motifs)
  combined_embedder = [RandomSubsetOfEmbedders(quantity_generator, embedders)]
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      combined_embedder)
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_seqs).generateSequences())
  sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  label_generator = IsInTraceLabelGenerator(np.asarray(motif_names))
  y = np.array(
      [
          label_generator.generateLabels(generated_seq)
          for generated_seq in generated_sequences
      ],
      dtype=bool)
  embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  return sequence_arr, y, embedding_arr


def simulate_differential_accessibility(
    pos_motif_names, neg_motif_names, seq_length, min_num_motifs,
    max_num_motifs, num_pos, num_neg, GC_fraction):
  """
    Generates data for differential accessibility task.

    Parameters
    ----------
    pos_motif_names : list
        List of strings.
    neg_motif_names : list
        List of strings.
    seq_length : int
    min_num_motifs : int
    max_num_motifs : int
    num_pos : int
    num_neg : int
    GC_fraction : float

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : 1darray
        Contains labels.
    embedding_arr: 1darray
        Array of embedding objects.
    """
  pos_motif_sequence_arr, _, positive_embedding_arr = simulate_multi_motif_embedding(
      pos_motif_names, seq_length, min_num_motifs, max_num_motifs, num_pos,
      GC_fraction)
  neg_motif_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
      neg_motif_names, seq_length, min_num_motifs, max_num_motifs, num_neg,
      GC_fraction)
  sequence_arr = np.concatenate((pos_motif_sequence_arr,
                                 neg_motif_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr


def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
  """
    Simulates two classes of sequences with motif1 and motif2:
        - Positive class sequences with motif1 and motif2 positioned
          min_spacing and max_spacing
        - Negative class sequences with independent motif1 and motif2 positioned
        anywhere in the sequence, not as a heterodimer grammar

    Parameters
    ----------
    seq_length : int, length of sequence
    GC_fraction : float, GC fraction in background sequence
    num_pos : int, number of positive class sequences
    num_neg : int, number of negatice class sequences
    motif1 : str, encode motif name
    motif2 : str, encode motif name
    min_spacing : int, minimum inter motif spacing
    max_spacing : int, maximum inter motif spacing

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    y : 1darray
        Array with positive/negative class labels.
    embedding_arr: list
        List of embedding objects.
    """

  motif1_generator = ReverseComplementWrapper(
      PwmSamplerFromLoadedMotifs(loaded_motifs, motif1))
  motif2_generator = ReverseComplementWrapper(
      PwmSamplerFromLoadedMotifs(loaded_motifs, motif2))
  separation_generator = UniformIntegerGenerator(min_spacing, max_spacing)
  embedder = EmbeddableEmbedder(
      PairEmbeddableGenerator(motif1_generator, motif2_generator,
                              separation_generator))
  embed_in_background = EmbedInABackground(
      ZeroOrderBackgroundGenerator(
          seq_length, discreteDistribution=get_distribution(GC_fraction)),
      [embedder])
  generated_sequences = tuple(
      GenerateSequenceNTimes(embed_in_background, num_pos).generateSequences())
  grammar_sequence_arr = np.array(
      [generated_seq.seq for generated_seq in generated_sequences])
  positive_embedding_arr = [
      generated_seq.embeddings for generated_seq in generated_sequences
  ]
  nongrammar_sequence_arr, _, negative_embedding_arr = simulate_multi_motif_embedding(
      [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)
  sequence_arr = np.concatenate((grammar_sequence_arr, nongrammar_sequence_arr))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr
+75 −0
Original line number Diff line number Diff line
import deepchem as dc
import numpy as np
import unittest


class TestDNASim(unittest.TestCase):

  def test_motif_density_localization_simulation(self):
    "Test motif density localization simulation." ""
    params = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "center_size": 150,
        "min_motif_counts": 2,
        "max_motif_counts": 4,
        "num_pos": 30,
        "num_neg": 30,
        "GC_fraction": 0.4
    }
    sequences, y, embed = dc.molnet.simulate_motif_density_localization(
        **params)
    assert sequences.shape == (60,)
    assert y.shape == (60, 1)

  def test_motif_counting_simulation(self):
    "Test motif counting"
    params = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "pos_counts": [5, 10],
        "neg_counts": [1, 2],
        "num_pos": 30,
        "num_neg": 30,
        "GC_fraction": 0.4
    }
    sequences, y, embed = dc.molnet.simulate_motif_counting(**params)
    assert sequences.shape == (60,)
    assert y.shape == (60, 1)

  def test_simple_motif_embedding(self):
    "Test simple motif embedding"
    params = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "num_seqs": 30,
        "GC_fraction": 0.4
    }
    sequences, embed = dc.molnet.simple_motif_embedding(**params)
    assert sequences.shape == (30,)

  def test_motif_density(self):
    "Test motif density"
    params = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "num_seqs": 30,
        "min_counts": 2,
        "max_counts": 4,
        "GC_fraction": 0.4
    }
    sequences, embed = dc.molnet.motif_density(**params)
    assert sequences.shape == (30,)

  def test_single_motif_detection(self):
    "Test single motif detection"
    params = {
        "motif_name": "TAL1_known4",
        "seq_length": 1000,
        "num_pos": 30,
        "num_neg": 30,
        "GC_fraction": 0.4
    }
    sequences, y, embed = dc.molnet.simulate_single_motif_detection(**params)
    assert sequences.shape == (60,)
    assert y.shape == (60, 1)
+1 −0
Original line number Diff line number Diff line
@@ -44,3 +44,4 @@ conda install -y -q -c conda-forge zlib=1.2.11
conda install -y -q -c conda-forge requests=2.18.4
conda install -y -q -c conda-forge xgboost=0.6a2
conda install -y -q -c rdkit rdkit=2017.09.1
yes | pip install simdna
+2 −4
Original line number Diff line number Diff line
from setuptools import setup

setup(
    setup_requires=['pbr'],
    pbr=True,
)
config = {'setup_requires': ['pbr'], 'pbr': True}
setup(**config)