Commit 53357041 authored by mparker2's avatar mparker2
Browse files

update to be able to use bigwigs, add internal priming filtering script

parent 68945734
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
def relative_tpe(gene_start, gene_end, aln_start, aln_end, strand, read_end):
    # make tpe position relative to gene start rather than 
    # genomic coordinates
    if read_end == '3':
        if strand == '+':
            return aln_end - gene_start
        else:
            return gene_end - aln_start
    elif read_end == '5':
        if strand == '+':
            return aln_start - gene_start
        else:
            return gene_end - aln_end


def intersect(inv_a, inv_b):
    a_start, a_end = inv_a
    b_start, b_end = inv_b
+147 −38
Original line number Diff line number Diff line
import heapq
from operator import itemgetter
import numpy as np
import pysam
import pyBigWig as pybw


def bam_cigar_to_invs(aln):
@@ -30,74 +29,171 @@ def bam_cigar_to_invs(aln):
    return start, end, strand, np.array(invs)


def pair_filter(filt_type):
    if filt_type == 'both':
        def _pair_filter(aln):
            return True
    elif filt_type == '1':
        def _pair_filter(aln):
            return aln.is_read1
    elif filt_type == '2':
        def _pair_filter(aln):
            return aln.is_read2
    return _pair_filter


def bam_query_iterator(bam, *args, **kwargs):
    strand = kwargs.pop('strand', None)
    pairs = kwargs.pop('pairs', 'both')
    pair_filt = pair_filter(pairs)
    if strand is None:
        for aln in bam.fetch(*args, **kwargs):
            if pair_filt(aln):
                yield bam_cigar_to_invs(aln)
    elif strand in '+-':
        is_reverse = strand == '-'
        for aln in bam.fetch(*args, **kwargs):
            if is_reverse == aln.is_reverse:
            if is_reverse == aln.is_reverse and pair_filt(aln):
                yield bam_cigar_to_invs(aln)
    else:
        raise ValueError('strand is not one of +-')


class MultiParser(object):

class MultiBamParser(object):
    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __init__(self, bam_fns, *, sort_fetch=False):
        self.bam_handles = {

class MultiBamParser(MultiParser):

    def __init__(self, bam_fns):
        self.handles = {
            bam_fn: pysam.AlignmentFile(bam_fn) for bam_fn in bam_fns
        }
        self.closed = False
        self._sort_fetch = sort_fetch

    def fetch(self, *args, **kwargs):
        queries = [
            bam_query_iterator(bam, *args, **kwargs)
            for bam in self.bam_handles.values()
            for bam in self.handles.values()
        ]
        if self._sort_fetch:
            yield from heapq.merge(*queries, key=itemgetter(1))
        else:
            for q in queries:
                yield from q
        return queries

    def close(self):
        for bam in self.bam_handles.values():
        for bam in self.handles.values():
            bam.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
class MultiBigWigParser(MultiParser):

    def __init__(self, bw_fns):
        # attempt to infer if stranded, each bw_fn should be comma separated list
        bw_fns = [tuple(fn.split(',')) for fn in bw_fns]
        if all([len(fn) == 2 for fn in bw_fns]):
            stranded = True
        elif all([len(fn) == 1 for fn in bw_fns]):
            stranded = False
        else:
            raise ValueError('Please provide either single bw files or comma separated pos,neg bw files')
        if stranded:
            self.handles = {
                bw_fn: (pybw.open(bw_fn[0]), pybw.open(bw_fn[1])) for bw_fn in bw_fns
            }
        else:
            self.handles = {
                bw_fn: (pybw.open(bw_fn[0]),) for bw_fn in bw_fns
            }
        self.closed = False
        self.stranded = stranded

def parse_exons(record):
    start = int(record[1])
    end = int(record[2])
    exstarts = np.fromstring(record[11], sep=',') + start
    exends = exstarts + np.fromstring(record[10], sep=',')
    exons = np.dstack([exstarts, exends])[0]
    return start, end, exons
    def fetch(self, chrom, start, end, strand=None):
        if strand is not None and not self.stranded:
            raise ValueError('cannot specify strand on unstranded bigwigs')
        if self.stranded:
            if strand == '+':
                queries = [
                    pos_bw.values(chrom, start, end, numpy=True)
                    for pos_bw, neg_bw in self.handles.values()
                ]
            elif strand == '-':
                queries = [
                    neg_bw.values(chrom, start, end, numpy=True)
                    for pos_bw, neg_bw in self.handles.values()
                ]
            elif strand is None:
                queries = [
                    np.nansum([
                        pos_bw.values(chrom, start, end, numpy=True),
                        neg_bw.values(chrom, start, end, numpy=True)
                    ], axis=0)
                    for pos_bw, neg_bw in self.handles.values()
                ]
        else:
            queries = [
                bw[0].values(chrom, start, end, numpy=True)
                for bw in self.handles
            ]
        return queries

    def close(self):
        for bws in self.handles.values():
            for bw in bws:
                bw.close()


def parse_bed_record(record):
def bam_or_bw(fn):
    if fn.endswith('.bam') or fn.endswith('.sam'):
        return True
    elif fn.endswith('.bw') or fn.lower().endswith('.bigwig'):
        return False
    else:
        raise ValueError('files must be bam, sam, or bigwig format')


def parse_inv(record, use_5utr):
    chrom = record[0]
    strand = record[5]
    start = int(record[1])
    end = int(record[2])
    gene_id = record[3]
    start, end, invs = parse_exons(record)
    return chrom, start, end, gene_id, strand, invs
    strand = record[5]
    if not use_5utr:
        cds_start = int(record[6])
        cds_end = int(record[7])
        if cds_start != cds_end:
            # not a protein coding gene
            if strand == '+':
                start = cds_start
            else:
                end = cds_end
    return chrom, start, end, strand, gene_id


def parse_bed_record(record, extend_gene_five_prime=0, use_5utr=True, extend_gene_three_prime=0):
    chrom, start, end, strand, gene_id = parse_inv(record, use_5utr)
    if extend_gene_five_prime:
        if strand == '+':
            start = max(0, start - extend_gene_five_prime)
        else:
            end += extend_gene_five_prime
    if extend_gene_three_prime:
        if strand == '+':
            end += extend_gene_three_prime
        else:
            start = max(0, start - extend_gene_three_prime)
    return chrom, start, end, gene_id, strand


def bed12_iterator(bed_fn):
def bed12_iterator(bed_fn, extend_gene_five_prime, ignore_5utr, extend_gene_three_prime):
    with open(bed_fn) as bed:
        for record in bed:
            record = record.split()
            yield parse_bed_record(record)
            yield parse_bed_record(
                record, extend_gene_five_prime, ignore_5utr, extend_gene_three_prime
            )


def write_output_bed(output_bed_fn, results):
@@ -105,15 +201,28 @@ def write_output_bed(output_bed_fn, results):
        for (
            chrom, start, end, gene_id, score, strand,
            wass_dist, wass_dir, wass_pval, wass_fdr,
            ks_stat, ks_pval, ks_fdr, hm_pval, hm_fdr,
            nreads_cntrl, nreads_treat
        ) in results.itertuples(index=False):

            record = (f'{chrom}\t{int(start):d}\t{int(end):d}\t'
                      f'{gene_id}\t{int(score):d}\t{strand}\t'
                      f'{wass_dist:.1f}\t{wass_dir:.1f}\t'
                      f'{wass_pval:.3g}\t{wass_fdr:.3g}\t'
                      f'{ks_stat:.2f}\t{ks_pval:.3g}\t{ks_fdr:.3g}\t'
                      f'{hm_pval:.3g}\t{hm_fdr:.3g}\t'
                      f'{nreads_cntrl:d}\t{nreads_treat:d}\n')
                      f'{sum(nreads_cntrl):d}\t{sum(nreads_treat):d}\n')
            bed.write(record)


def write_apa_site_bed(output_bed_fn, results):
    with open(output_bed_fn, 'w') as bed:
        for (
            chrom, start, end, gene_id, strand,
            cntrl_count, treat_count,
            cntrl_frac, treat_frac,
            relative_change
        ) in results.itertuples(index=False):
            direction = int(relative_change > 0)
            record = (f'{chrom}\t{start:d}\t{end:d}\t'
                      f'{gene_id}\t{direction:d}\t{strand}\t'
                      f'{cntrl_count:d}\t{treat_count:d}\t'
                      f'{cntrl_frac:.2f}\t{treat_frac:.2f}\t'
                      f'{relative_change:.2f}\n')
            bed.write(record)
 No newline at end of file
+49 −31
Original line number Diff line number Diff line
import click

from .io import write_output_bed
from .io import write_output_bed, write_apa_site_bed
from .ref_guided import ref_guided_diff_tpe


def annot_free_diff_tpe(treat_bam_fns, cntrl_bam_fns,
                        min_read_overlap, bootstraps,
                        log10_transform, processes):
    raise NotImplementedError('TODO!')
    

@click.command()
@click.option('-t', '--treatment-bams', required=True, multiple=True)
@click.option('-c', '--control-bams', required=True, multiple=True)
@click.option('-o', '--output-bed', required=True)
@click.option('-a', '--annotation-bed12', required=False, default=None)
@click.option('-t', '--treatment-fns', required=True, multiple=True)
@click.option('-c', '--control-fns', required=True, multiple=True)
@click.option('-o', '--output-prefix', required=True)
@click.option('--write-apa-sites/--no-apa-sites', required=False, default=True)
@click.option('-a', '--annotation-bed12', required=True)
@click.option('--read-strand', type=click.Choice(['same', 'opposite', 'unstranded']), default='same')
@click.option('--read-end', type=click.Choice(['3', '5']), default='3')
@click.option('--paired-end-read', type=click.Choice(['1', '2', 'both', 'single']), default='single')
@click.option('--min-read-overlap', default=0.2)
@click.option('--bootstraps', default=999)
@click.option('--log10-transform', default=False)
@click.option('--min-reads-per-rep', default=10)
@click.option('--extend-gene-five-prime', default=0)
@click.option('--use-5utr/--ignore-5utr', default=True)
@click.option('--extend-gene-three-prime', default=0)
@click.option('--bootstraps', default=1999)
@click.option('--threshold', default=0.05)
@click.option('--tpe-cluster-sigma', default=15)
@click.option('--min-tpe-reads', default=5)
@click.option('--min-tpe-fractional-change', default=0.1)
@click.option('-p', '--processes', default=4)
def d3pendr(treatment_bams, control_bams, output_bed, annotation_bed12,
            min_read_overlap, bootstraps, log10_transform, processes):
def d3pendr(treatment_fns, control_fns,
            output_prefix, write_apa_sites,
            annotation_bed12,
            read_strand, read_end, paired_end_read,
            min_read_overlap, min_reads_per_rep,
            extend_gene_five_prime, use_5utr,
            extend_gene_three_prime,
            bootstraps, threshold, tpe_cluster_sigma,
            min_tpe_reads, min_tpe_fractional_change,
            processes):
    '''
    d3pendr: Differential 3' End analysis of Nanopore Direct RNAseq

    Identifies differential polyadenylation events in either an
    annotation dependent or (TODO) reference free manner, using
    annotation dependent manner, using
    a permutation test of Wasserstein distance between 3' end 
    distributions and a KS test.

    Outputs bed6 format with extra columns.
    '''
    if annotation_bed12 is None:
        results = annot_free_diff_tpe(
            treatment_bams, control_bams,
            min_read_overlap, bootstraps,
            log10_transform,
            processes
        )
    else:
        results = ref_guided_diff_tpe(

    if paired_end_read == 'single':
        # for options single or both, all reads are used
        paired_end_read = 'both'

    results, apa_sites = ref_guided_diff_tpe(
        annotation_bed12,
            treatment_bams, control_bams,
            min_read_overlap, bootstraps,
            log10_transform,
        treatment_fns, control_fns,
        read_strand, read_end, paired_end_read,
        min_read_overlap, min_reads_per_rep,
        extend_gene_five_prime, use_5utr,
        extend_gene_three_prime,
        bootstraps, threshold, write_apa_sites,
        tpe_cluster_sigma, min_tpe_reads, min_tpe_fractional_change,
        processes
    )
    output_bed = f'{output_prefix}.apa_results.bed'
    write_output_bed(output_bed, results)
    if write_apa_sites:
        apa_site_bed = f'{output_prefix}.apa_sites.bed'
        write_apa_site_bed(apa_site_bed, apa_sites)


if __name__ == '__main__':
+226 −69

File changed.

Preview size limit exceeded, changes collapsed.

+48 −23
Original line number Diff line number Diff line
import itertools as it
import numpy as np
from scipy import stats


def wasserstein_distance_and_direction(u_values, v_values, log10_transform):
def wasserstein_distance_and_direction(u_values, v_values):

    # modified from scipy.stats._cdf_distance

@@ -14,8 +15,6 @@ def wasserstein_distance_and_direction(u_values, v_values, log10_transform):

    # Compute the differences between pairs of successive values of u and v.
    deltas = np.diff(all_values)
    if log10_transform:
        deltas = np.log10(deltas + 1)

    # Get the respective positions of the values of u and v among the values of
    # both distributions.
@@ -36,7 +35,7 @@ def wasserstein_distance_and_direction(u_values, v_values, log10_transform):
def wasserstein_test(u_values, v_values, bootstraps=999):
    # permutation test of wasserstein distance
    # based on the one outlined in https://github.com/cdowd/twosamples
    obs = stats.wasserstein_distance(u_values, v_values)
    wass_dist, wass_dir = wasserstein_distance_and_direction(u_values, v_values)

    # under null hypothesis the samples are drawn from the same distribution
    # so we can make expected wasserstein values by permuting values between
@@ -50,22 +49,48 @@ def wasserstein_test(u_values, v_values, bootstraps=999):
    exp = np.array(exp)

    # bootstrap p value with pseudocount
    p = ((exp >= obs).sum() + 1) / (bootstraps + 1)
    return p
    p_val = ((exp >= wass_dist).sum() + 1) / (bootstraps + 1)
    return wass_dist, wass_dir, p_val


def tpe_stats(tpe_dist_cntrl, tpe_dist_treat, bootstraps=999, log10_transform=False):
    wass_dist, wass_dir = wasserstein_distance_and_direction(
        tpe_dist_cntrl, tpe_dist_treat, log10_transform
def wasserstein_test_with_replicates(u_values, v_values,
                                     bootstraps=999, threshold=0.05):
    # first pool replicates and perform normal wass test
    u_pooled = np.concatenate(u_values)
    v_pooled = np.concatenate(v_values)
    wass_dist, wass_dir, p_val = wasserstein_test(u_pooled, v_pooled, bootstraps=bootstraps)

    # we only bother testing for homogeneity of replicates if the test between
    # replicates is significant
    if p_val <= threshold:

        # produce pairwise distances between replicates for both conditions
        u_self_wass = np.array(
            [stats.wasserstein_distance(u_i, u_j)
             for u_i, u_j in it.combinations(u_values, r=2)]
        )
        v_self_wass = np.array(
            [stats.wasserstein_distance(v_i, v_j)
             for v_i, v_j in it.combinations(v_values, r=2)]
        )
    wass_pval = wasserstein_test(tpe_dist_cntrl, tpe_dist_treat, bootstraps)
    ks_stat, ks_pval = stats.ks_2samp(tpe_dist_cntrl, tpe_dist_treat)

    # combine the two tests using the harmonic mean of the p values
    try:
        hm_pval = stats.hmean([wass_pval, ks_pval])
    except ValueError:
        # if result is so sig that ks_2samp reports a pval of 0.0, hmean throws error
        # use the smallest number that can be represented as float64 as placeholder
        hm_pval = stats.hmean([wass_pval, np.finfo(np.float64).tiny])
    return wass_dist, wass_dir, wass_pval, ks_stat, ks_pval, hm_pval
 No newline at end of file

        # samples are considered homogeneous if the distance between the two
        # conds is greater than all the pairwise distances within conds.
        is_homogeneous = 1 - int((u_self_wass >= wass_dist).any() |
                                 (v_self_wass >= wass_dist).any())
        if not is_homogeneous:
            p_val = 1
    return wass_dist, wass_dir, p_val


def tpe_stats(tpe_dist_cntrl, tpe_dist_treat, bootstraps=999, threshold=0.05):
    if len(tpe_dist_cntrl) == 1:
        wass_dist, wass_dir, wass_pval = wasserstein_test(
            tpe_dist_cntrl[0], tpe_dist_treat[0], bootstraps
        )
    else:
        wass_dist, wass_dir, wass_pval = wasserstein_test_with_replicates(
            tpe_dist_cntrl, tpe_dist_treat, bootstraps, threshold
        )

    return wass_dist, wass_dir, wass_pval
 No newline at end of file
Loading