Commit 3fc05c40 authored by Bharath Ramsundar's avatar Bharath Ramsundar
Browse files

reverting

parent ecc4af56
......@@ -32,6 +32,12 @@ from deepchem.molnet.load_function.thermosol_datasets import load_thermosol
from deepchem.molnet.load_function.hppb_datasets import load_hppb
from deepchem.molnet.load_function.chembl25_datasets import load_chembl25
from deepchem.molnet.dnasim import simulate_motif_density_localization
from deepchem.molnet.dnasim import simulate_motif_counting
from deepchem.molnet.dnasim import simple_motif_embedding
from deepchem.molnet.dnasim import motif_density
from deepchem.molnet.dnasim import simulate_single_motif_detection
from deepchem.molnet.run_benchmark import run_benchmark
#from deepchem.molnet.run_benchmark_low_data import run_benchmark_low_data
from deepchem.molnet import run_benchmark_models
from collections import OrderedDict
import numpy as np
def get_distribution(GC_fraction):
  """Build a per-base background distribution with the given GC content.

  G and C each receive half of ``GC_fraction``; A and T split the
  remaining probability mass equally.

  Parameters
  ----------
  GC_fraction : float
    Desired fraction of G/C base pairs in the background sequence.

  Returns
  -------
  simdna.util.DiscreteDistribution
    Distribution over the four bases 'A', 'C', 'G', 'T'.
  """
  from simdna.util import DiscreteDistribution
  at_prob = (1 - GC_fraction) / 2
  gc_prob = GC_fraction / 2
  return DiscreteDistribution({
      'A': at_prob,
      'C': gc_prob,
      'G': gc_prob,
      'T': at_prob
  })
def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
  """
  Simulates sequences with a motif embedded anywhere in the sequence.

  When ``motif_name`` is None, plain background sequences (no embedded
  motif) are generated instead.

  Parameters
  ----------
  motif_name : str
    encode motif name
  seq_length : int
    length of sequence
  num_seqs: int
    number of sequences
  GC_fraction : float
    GC basepair fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
    Array with sequence strings.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  import simdna
  from simdna import synthetic
  if motif_name is None:
    embedders = []
  else:
    # Sample motif instances (and their reverse complements) from the
    # ENCODE PWM collection.
    motifs = synthetic.LoadedEncodeMotifs(
        simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
    sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, motif_name)
    embedders = [
        synthetic.SubstringEmbedder(
            synthetic.ReverseComplementWrapper(sampler))
    ]
  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  sequence_source = synthetic.GenerateSequenceNTimes(
      synthetic.EmbedInABackground(background, embedders), num_seqs)
  generated = tuple(sequence_source.generateSequences())
  sequence_arr = np.array([seq.seq for seq in generated])
  embedding_arr = [seq.embeddings for seq in generated]
  return sequence_arr, embedding_arr
def motif_density(motif_name,
                  seq_length,
                  num_seqs,
                  min_counts,
                  max_counts,
                  GC_fraction,
                  central_bp=None):
  """
  Returns sequences with motif density, along with embeddings array.

  Each sequence contains a uniformly-sampled number of motif instances
  between ``min_counts`` and ``max_counts``. If ``central_bp`` is given,
  motifs are restricted to the central ``central_bp`` base pairs;
  otherwise they may appear anywhere in the sequence.

  Parameters
  ----------
  motif_name : str
    encode motif name
  seq_length : int
    length of sequence
  num_seqs : int
    number of sequences
  min_counts : int
    minimum number of motif instances per sequence
  max_counts : int
    maximum number of motif instances per sequence
  GC_fraction : float
    GC basepair fraction in background sequence
  central_bp : int, optional
    width of the central window motifs are confined to

  Returns
  -------
  sequence_arr : 1darray
    Array with sequence strings.
  embedding_arr : list
    List of embedding objects.
  """
  import simdna
  from simdna import synthetic
  motifs = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
  sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, motif_name)
  # Confine motif positions to the center of the sequence when requested.
  if central_bp is None:
    positions = synthetic.UniformPositionGenerator()
  else:
    positions = synthetic.InsideCentralBp(central_bp)
  counts = synthetic.UniformIntegerGenerator(min_counts, max_counts)
  embedders = [
      synthetic.RepeatedEmbedder(
          synthetic.SubstringEmbedder(
              synthetic.ReverseComplementWrapper(sampler), positions), counts)
  ]
  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  sequence_source = synthetic.GenerateSequenceNTimes(
      synthetic.EmbedInABackground(background, embedders), num_seqs)
  generated = tuple(sequence_source.generateSequences())
  sequence_arr = np.array([seq.seq for seq in generated])
  embedding_arr = [seq.embeddings for seq in generated]
  return sequence_arr, embedding_arr
def simulate_single_motif_detection(motif_name, seq_length, num_pos, num_neg,
                                    GC_fraction):
  """
  Simulates two classes of sequences:
      - Positive class sequence with a motif
        embedded anywhere in the sequence
      - Negative class sequence without the motif

  Parameters
  ----------
  motif_name : str
    encode motif name
  seq_length : int
    length of sequence
  num_pos : int
    number of positive class sequences
  num_neg : int
    number of negative class sequences
  GC_fraction : float
    GC fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
    Array with sequence strings.
  y : 1darray
    Array with positive/negative class labels.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  # Positives contain the motif; negatives are pure background
  # (motif_name=None disables embedding).
  pos_seqs, pos_embeddings = simple_motif_embedding(motif_name, seq_length,
                                                    num_pos, GC_fraction)
  neg_seqs, neg_embeddings = simple_motif_embedding(None, seq_length, num_neg,
                                                    GC_fraction)
  sequence_arr = np.concatenate((pos_seqs, neg_seqs))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  return sequence_arr, y, pos_embeddings + neg_embeddings
def simulate_motif_counting(motif_name, seq_length, pos_counts, neg_counts,
                            num_pos, num_neg, GC_fraction):
  """
  Generates data for motif counting task.

  Positive sequences carry between ``pos_counts[0]`` and ``pos_counts[1]``
  motif instances; negative sequences carry between ``neg_counts[0]`` and
  ``neg_counts[1]``.

  Parameters
  ----------
  motif_name : str
  seq_length : int
  pos_counts : list
    (min_counts, max_counts) for positive set.
  neg_counts : list
    (min_counts, max_counts) for negative set.
  num_pos : int
  num_neg : int
  GC_fraction : float

  Returns
  -------
  sequence_arr : 1darray
    Contains sequence strings.
  y : 1darray
    Contains labels.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  pos_count_sequence_array, positive_embedding_arr = motif_density(
      motif_name, seq_length, num_pos, pos_counts[0], pos_counts[1],
      GC_fraction)
  # BUGFIX: the negative set must contain num_neg sequences (was num_pos),
  # otherwise the sequence array disagrees with the label array below
  # whenever num_pos != num_neg.
  neg_count_sequence_array, negative_embedding_arr = motif_density(
      motif_name, seq_length, num_neg, neg_counts[0], neg_counts[1],
      GC_fraction)
  sequence_arr = np.concatenate((pos_count_sequence_array,
                                 neg_count_sequence_array))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  embedding_arr = positive_embedding_arr + negative_embedding_arr
  return sequence_arr, y, embedding_arr
def simulate_motif_density_localization(motif_name, seq_length, center_size,
                                        min_motif_counts, max_motif_counts,
                                        num_pos, num_neg, GC_fraction):
  """
  Simulates two classes of sequences:
      - Positive class sequences with multiple motif instances
        in center of the sequence.
      - Negative class sequences with multiple motif instances
        anywhere in the sequence.

  The number of motif instances is uniformly sampled
  between minimum and maximum motif counts.

  Parameters
  ----------
  motif_name : str
    encode motif name
  seq_length : int
    length of sequence
  center_size : int
    length of central part of the sequence where motifs can be positioned
  min_motif_counts : int
    minimum number of motif instances
  max_motif_counts : int
    maximum number of motif instances
  num_pos : int
    number of positive class sequences
  num_neg : int
    number of negative class sequences
  GC_fraction : float
    GC fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
    Contains sequence strings.
  y : 1darray
    Contains labels.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  # Positives: motifs restricted to the central center_size bp.
  pos_seqs, pos_embeddings = motif_density(motif_name, seq_length, num_pos,
                                           min_motif_counts, max_motif_counts,
                                           GC_fraction, center_size)
  # Negatives: same counts, but motifs may appear anywhere.
  neg_seqs, neg_embeddings = motif_density(motif_name, seq_length, num_neg,
                                           min_motif_counts, max_motif_counts,
                                           GC_fraction)
  sequence_arr = np.concatenate((pos_seqs, neg_seqs))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  return sequence_arr, y, pos_embeddings + neg_embeddings
def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
  """
  Generates data for multi motif recognition task.

  Each sequence embeds a uniformly-sampled subset (of size between
  ``min_num_motifs`` and ``max_num_motifs``) of the given motifs; the
  label matrix records which motifs each sequence contains.

  Parameters
  ----------
  motif_names : list
    List of strings.
  seq_length : int
  min_num_motifs : int
  max_num_motifs : int
  num_seqs : int
  GC_fraction : float

  Returns
  -------
  sequence_arr : 1darray
    Contains sequence strings.
  y : ndarray
    Contains labels for each motif.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  import simdna
  from simdna import synthetic
  motifs = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)

  def build_embedder(name):
    # One named embedder per motif so labels can be traced back to it.
    sampler = synthetic.PwmSamplerFromLoadedMotifs(motifs, name)
    return synthetic.SubstringEmbedder(
        synthetic.ReverseComplementWrapper(sampler), name=name)

  per_motif_embedders = [build_embedder(name) for name in motif_names]
  subset_size = synthetic.UniformIntegerGenerator(min_num_motifs,
                                                  max_num_motifs)
  combined_embedder = [
      synthetic.RandomSubsetOfEmbedders(subset_size, per_motif_embedders)
  ]
  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  sequence_source = synthetic.GenerateSequenceNTimes(
      synthetic.EmbedInABackground(background, combined_embedder), num_seqs)
  generated = tuple(sequence_source.generateSequences())
  sequence_arr = np.array([seq.seq for seq in generated])
  # Per-sequence boolean vector: which motifs were actually embedded.
  labeler = synthetic.IsInTraceLabelGenerator(np.asarray(motif_names))
  y = np.array([labeler.generateLabels(seq) for seq in generated], dtype=bool)
  embedding_arr = [seq.embeddings for seq in generated]
  return sequence_arr, y, embedding_arr
def simulate_differential_accessibility(
    pos_motif_names, neg_motif_names, seq_length, min_num_motifs,
    max_num_motifs, num_pos, num_neg, GC_fraction):
  """
  Generates data for differential accessibility task.

  Positive sequences embed motifs from ``pos_motif_names``; negative
  sequences embed motifs from ``neg_motif_names``.

  Parameters
  ----------
  pos_motif_names : list
    List of strings.
  neg_motif_names : list
    List of strings.
  seq_length : int
  min_num_motifs : int
  max_num_motifs : int
  num_pos : int
  num_neg : int
  GC_fraction : float

  Returns
  -------
  sequence_arr : 1darray
    Contains sequence strings.
  y : 1darray
    Contains labels.
  embedding_arr: 1darray
    Array of embedding objects.
  """
  # Per-motif label matrices from the multi-motif simulator are discarded;
  # this task only needs a single positive/negative label per sequence.
  pos_seqs, _, pos_embeddings = simulate_multi_motif_embedding(
      pos_motif_names, seq_length, min_num_motifs, max_num_motifs, num_pos,
      GC_fraction)
  neg_seqs, _, neg_embeddings = simulate_multi_motif_embedding(
      neg_motif_names, seq_length, min_num_motifs, max_num_motifs, num_neg,
      GC_fraction)
  sequence_arr = np.concatenate((pos_seqs, neg_seqs))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  return sequence_arr, y, pos_embeddings + neg_embeddings
def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
  """
  Simulates two classes of sequences with motif1 and motif2:
      - Positive class sequences with motif1 and motif2 positioned
        min_spacing and max_spacing apart
      - Negative class sequences with independent motif1 and motif2 positioned
        anywhere in the sequence, not as a heterodimer grammar

  Parameters
  ----------
  motif1 : str, encode motif name
  motif2 : str, encode motif name
  seq_length : int, length of sequence
  min_spacing : int, minimum inter motif spacing
  max_spacing : int, maximum inter motif spacing
  num_pos : int, number of positive class sequences
  num_neg : int, number of negative class sequences
  GC_fraction : float, GC fraction in background sequence

  Returns
  -------
  sequence_arr : 1darray
    Array with sequence strings.
  y : 1darray
    Array with positive/negative class labels.
  embedding_arr: list
    List of embedding objects.
  """
  import simdna
  from simdna import synthetic
  motifs = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)
  sampler1 = synthetic.ReverseComplementWrapper(
      synthetic.PwmSamplerFromLoadedMotifs(motifs, motif1))
  sampler2 = synthetic.ReverseComplementWrapper(
      synthetic.PwmSamplerFromLoadedMotifs(motifs, motif2))
  spacing = synthetic.UniformIntegerGenerator(min_spacing, max_spacing)
  # Positives: embed the two motifs as a constrained-spacing pair.
  pair_embedder = synthetic.EmbeddableEmbedder(
      synthetic.PairEmbeddableGenerator(sampler1, sampler2, spacing))
  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  sequence_source = synthetic.GenerateSequenceNTimes(
      synthetic.EmbedInABackground(background, [pair_embedder]), num_pos)
  generated = tuple(sequence_source.generateSequences())
  grammar_seqs = np.array([seq.seq for seq in generated])
  pos_embeddings = [seq.embeddings for seq in generated]
  # Negatives: both motifs present (exactly one instance each) but
  # independently positioned, with no spacing constraint.
  nongrammar_seqs, _, neg_embeddings = simulate_multi_motif_embedding(
      [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)
  sequence_arr = np.concatenate((grammar_seqs, nongrammar_seqs))
  y = np.array([[True]] * num_pos + [[False]] * num_neg)
  return sequence_arr, y, pos_embeddings + neg_embeddings
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment