Commit fd3fcbd5 authored by mparker2's avatar mparker2
Browse files

added gamma fitting of EMD null dist

parent 4d57c0fc
Loading
Loading
Loading
Loading
+10 −6
Original line number Diff line number Diff line
@@ -14,12 +14,14 @@ from .ref_guided import ref_guided_diff_tpe
@click.option('--read-end', type=click.Choice(['3', '5']), default='3')
@click.option('--paired-end-read', type=click.Choice(['1', '2', 'both', 'single']), default='single')
@click.option('--min-read-overlap', default=0.2)
@click.option('--min-reads-per-rep', default=10)
@click.option('--min-reads-per-rep', default=5)
@click.option('--extend-gene-five-prime', default=0)
@click.option('--use-5utr/--ignore-5utr', default=True)
@click.option('--extend-gene-three-prime', default=0)
@click.option('--bootstraps', default=1999)
@click.option('--bootstraps', default=999)
@click.option('--threshold', default=0.05)
@click.option('--use-gamma-model/--no-model', default=True)
@click.option('--test-homogeneity/--no-test-homogeneity', default=False)
@click.option('--tpe-cluster-sigma', default=15)
@click.option('--min-tpe-reads', default=5)
@click.option('--min-tpe-fractional-change', default=0.1)
@@ -31,11 +33,11 @@ def d3pendr(treatment_fns, control_fns,
            min_read_overlap, min_reads_per_rep,
            extend_gene_five_prime, use_5utr,
            extend_gene_three_prime,
            bootstraps, threshold, tpe_cluster_sigma,
            min_tpe_reads, min_tpe_fractional_change,
            bootstraps, threshold, use_gamma_model, test_homogeneity,
            tpe_cluster_sigma, min_tpe_reads, min_tpe_fractional_change,
            processes):
    '''
    d3pendr: Differential 3' End analysis of Nanopore Direct RNAseq
    d3pendr: Differential 3' End analysis of Nanopore Direct RNAs

    Identifies differential polyadenylation events in either an
    annotation dependent manner, using
@@ -56,7 +58,9 @@ def d3pendr(treatment_fns, control_fns,
        min_read_overlap, min_reads_per_rep,
        extend_gene_five_prime, use_5utr,
        extend_gene_three_prime,
        bootstraps, threshold, write_apa_sites,
        bootstraps, threshold,
        use_gamma_model, test_homogeneity,
        write_apa_sites,
        tpe_cluster_sigma, min_tpe_reads, min_tpe_fractional_change,
        processes
    )
+9 −5
Original line number Diff line number Diff line
@@ -160,8 +160,9 @@ def get_apa_tpes(cntrl_distrib, nreads_cntrl,
def process_gtf_records(gtf_records, treat_bam_fns, cntrl_bam_fns,
                        read_strand, read_end, paired_end_read,
                        min_read_overlap, min_reads,
                        bootstraps, threshold, is_bam,
                        find_apa_tpe_sites, tpe_sigma,
                        bootstraps, threshold,
                        use_gamma_model, test_homogeneity,
                        is_bam, find_apa_tpe_sites, tpe_sigma,
                        tpe_min_reads, tpe_min_rel_change):
    results = []
    tpe_apa_res = []
@@ -182,11 +183,12 @@ def process_gtf_records(gtf_records, treat_bam_fns, cntrl_bam_fns,
                paired_end_read, is_bam
            )


            if (nreads_cntrl >= min_reads).all() and (nreads_treat >= min_reads).all():
                wass_dist, wass_dir, wass_pval = tpe_stats(
                    cntrl_distribs, treat_distribs,
                    bootstraps=bootstraps, threshold=threshold,
                    use_gamma_model=use_gamma_model,
                    test_homogeneity=test_homogeneity,
                )
                results.append([
                    chrom, start, end, gene_id, round(wass_dist), strand,
@@ -243,6 +245,7 @@ def ref_guided_diff_tpe(gtf_fn, treat_bam_fns, cntrl_bam_fns,
                        extend_gene_five_prime, use_5utr,
                        extend_gene_three_prime,
                        bootstraps, threshold,
                        use_gamma_model, test_homogeneity,
                        find_tpe_sites, tpe_sigma,
                        tpe_min_reads, tpe_min_rel_change,
                        processes):
@@ -254,8 +257,9 @@ def ref_guided_diff_tpe(gtf_fn, treat_bam_fns, cntrl_bam_fns,
        treat_bam_fns, cntrl_bam_fns,
        read_strand, read_end, paired_end_read,
        min_read_overlap, min_reads_per_cond,
        bootstraps, threshold, filetype,
        find_tpe_sites, tpe_sigma,
        bootstraps, threshold,
        use_gamma_model, test_homogeneity,
        filetype, find_tpe_sites, tpe_sigma,
        tpe_min_reads, tpe_min_rel_change,
    )
    gtf_it = gtf_iterator(
+23 −9
Original line number Diff line number Diff line
@@ -32,7 +32,7 @@ def wasserstein_distance_and_direction(u_values, v_values):
    return np.sum(np.abs(mag)), np.sum(mag)


def wasserstein_test(u_values, v_values, bootstraps=999):
def wasserstein_test(u_values, v_values, bootstraps=999, use_gamma_model=True):
    # permutation test of wasserstein distance
    # based on the one outlined in https://github.com/cdowd/twosamples
    wass_dist, wass_dir = wasserstein_distance_and_direction(u_values, v_values)
@@ -48,21 +48,32 @@ def wasserstein_test(u_values, v_values, bootstraps=999):
        exp.append(stats.wasserstein_distance(pool[:n], pool[n:]))
    exp = np.array(exp)

    if not use_gamma_model:
        # bootstrap p value with pseudocount
        p_val = ((exp >= wass_dist).sum() + 1) / (bootstraps + 1)
    else:
        # fit a gamma distribution to the expected distances
        g = stats.gamma(*stats.gamma.fit(exp))
        # compute p value using survival function
        p_val = g.sf(wass_dist)
    return wass_dist, wass_dir, p_val


def wasserstein_test_with_replicates(u_values, v_values,
                                     bootstraps=999, threshold=0.05):
                                     bootstraps=999,
                                     threshold=0.05,
                                     use_gamma_model=True,
                                     test_homogeneity=False):
    # first pool replicates and perform normal wass test
    u_pooled = np.concatenate(u_values)
    v_pooled = np.concatenate(v_values)
    wass_dist, wass_dir, p_val = wasserstein_test(u_pooled, v_pooled, bootstraps=bootstraps)
    wass_dist, wass_dir, p_val = wasserstein_test(
        u_pooled, v_pooled, bootstraps=bootstraps, use_gamma_model=use_gamma_model
    )

    # we only bother testing for homogeneity of replicates if the test between
    # replicates is significant
    if p_val <= threshold:
    if test_homogeneity and p_val <= threshold:

        # produce pairwise distances between replicates for both conditions
        u_self_wass = np.array(
@@ -83,14 +94,17 @@ def wasserstein_test_with_replicates(u_values, v_values,
    return wass_dist, wass_dir, p_val


def tpe_stats(tpe_dist_cntrl, tpe_dist_treat, bootstraps=999, threshold=0.05):
def tpe_stats(tpe_dist_cntrl, tpe_dist_treat,
              bootstraps=999, threshold=0.05,
              use_gamma_model=True, test_homogeneity=False):
    if len(tpe_dist_cntrl) == 1:
        wass_dist, wass_dir, wass_pval = wasserstein_test(
            tpe_dist_cntrl[0], tpe_dist_treat[0], bootstraps
            tpe_dist_cntrl[0], tpe_dist_treat[0], bootstraps, use_gamma_model,
        )
    else:
        wass_dist, wass_dir, wass_pval = wasserstein_test_with_replicates(
            tpe_dist_cntrl, tpe_dist_treat, bootstraps, threshold
            tpe_dist_cntrl, tpe_dist_treat, bootstraps,
            threshold, use_gamma_model, test_homogeneity,
        )

    return wass_dist, wass_dir, wass_pval
 No newline at end of file