Commit 68945734 authored by mtparker's avatar mtparker
Browse files

init nanopore project

parents
Loading
Loading
Loading
Loading

.gitignore

0 → 100755
+120 −0
Original line number Diff line number Diff line
*.bed
*.h5
data/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

d3pendr/__init__.py

0 → 100755
+0 −0

Empty file added.

d3pendr/invs.py

0 → 100755
+42 −0
Original line number Diff line number Diff line
def intersect(inv_a, inv_b):
    a_start, a_end = inv_a
    b_start, b_end = inv_b
    if a_end < b_start or a_start > b_end:
        return 0
    else:
        s = max(a_start, b_start)
        e = min(a_end, b_end)
        return e - s


def intersect_spliced_invs(invs_a, invs_b):
    score = 0
    invs_a = iter(invs_a)
    invs_b = iter(invs_b)
    a_start, a_end = next(invs_a)
    b_start, b_end = next(invs_b)
    while True:
        if a_end < b_start:
            try:
                a_start, a_end = next(invs_a)
            except StopIteration:
                break
        elif a_start > b_end:
            try:
                b_start, b_end = next(invs_b)
            except StopIteration:
                break
        else:
            score += intersect([a_start, a_end], [b_start, b_end])
            if a_end > b_end:
                try:
                    b_start, b_end = next(invs_b)
                except StopIteration:
                    break
            else:
                try:
                    a_start, a_end = next(invs_a)
                except StopIteration:
                    break
    return score
 No newline at end of file

d3pendr/io.py

0 → 100755
+120 −0
Original line number Diff line number Diff line
import heapq
from operator import itemgetter
import numpy as np
import pysam


def bam_cigar_to_invs(aln):
    invs = []
    start = aln.reference_start
    end = aln.reference_end
    strand = '-' if aln.is_reverse else '+'
    left = start
    right = left
    has_ins = False
    for op, ln in aln.cigar:
        if op in (1, 4, 5):
            # does not consume reference
            continue
        elif op in (0, 2, 7, 8):
            # consume reference but do not add to invs yet
            right += ln
        elif op == 3:
            invs.append([left, right])
            left = right + ln
            right = left
    if right > left:
        invs.append([left, right])
    assert invs[0][0] == start
    assert invs[-1][1] == end
    return start, end, strand, np.array(invs)


def bam_query_iterator(bam, *args, **kwargs):
    strand = kwargs.pop('strand', None)
    if strand is None:
        for aln in bam.fetch(*args, **kwargs):
            yield bam_cigar_to_invs(aln)
    elif strand in '+-':
        is_reverse = strand == '-'
        for aln in bam.fetch(*args, **kwargs):
            if is_reverse == aln.is_reverse:
                yield bam_cigar_to_invs(aln)
    else:
        raise ValueError('strand is not one of +-')
                


class MultiBamParser(object):

    def __init__(self, bam_fns, *, sort_fetch=False):
        self.bam_handles = {
            bam_fn: pysam.AlignmentFile(bam_fn) for bam_fn in bam_fns
        }
        self.closed = False
        self._sort_fetch = sort_fetch

    def fetch(self, *args, **kwargs):
        queries = [
            bam_query_iterator(bam, *args, **kwargs)
            for bam in self.bam_handles.values()
        ]
        if self._sort_fetch:
            yield from heapq.merge(*queries, key=itemgetter(1))
        else:
            for q in queries:
                yield from q

    def close(self):
        for bam in self.bam_handles.values():
            bam.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


def parse_exons(record):
    start = int(record[1])
    end = int(record[2])
    exstarts = np.fromstring(record[11], sep=',') + start
    exends = exstarts + np.fromstring(record[10], sep=',')
    exons = np.dstack([exstarts, exends])[0]
    return start, end, exons


def parse_bed_record(record):
    chrom = record[0]
    strand = record[5]
    gene_id = record[3]
    start, end, invs = parse_exons(record)
    return chrom, start, end, gene_id, strand, invs


def bed12_iterator(bed_fn):
    with open(bed_fn) as bed:
        for record in bed:
            record = record.split()
            yield parse_bed_record(record)


def write_output_bed(output_bed_fn, results):
    with open(output_bed_fn, 'w') as bed:
        for (
            chrom, start, end, gene_id, score, strand,
            wass_dist, wass_dir, wass_pval, wass_fdr,
            ks_stat, ks_pval, ks_fdr, hm_pval, hm_fdr,
            nreads_cntrl, nreads_treat
        ) in results.itertuples(index=False):

            record = (f'{chrom}\t{int(start):d}\t{int(end):d}\t'
                      f'{gene_id}\t{int(score):d}\t{strand}\t'
                      f'{wass_dist:.1f}\t{wass_dir:.1f}\t'
                      f'{wass_pval:.3g}\t{wass_fdr:.3g}\t'
                      f'{ks_stat:.2f}\t{ks_pval:.3g}\t{ks_fdr:.3g}\t'
                      f'{hm_pval:.3g}\t{hm_fdr:.3g}\t'
                      f'{nreads_cntrl:d}\t{nreads_treat:d}\n')
            bed.write(record)
 No newline at end of file

d3pendr/main.py

0 → 100755
+54 −0
Original line number Diff line number Diff line
import click

from .io import write_output_bed
from .ref_guided import ref_guided_diff_tpe


def annot_free_diff_tpe(treat_bam_fns, cntrl_bam_fns,
                        min_read_overlap, bootstraps,
                        log10_transform, processes):
    raise NotImplementedError('TODO!')
    

@click.command()
@click.option('-t', '--treatment-bams', required=True, multiple=True)
@click.option('-c', '--control-bams', required=True, multiple=True)
@click.option('-o', '--output-bed', required=True)
@click.option('-a', '--annotation-bed12', required=False, default=None)
@click.option('--min-read-overlap', default=0.2)
@click.option('--bootstraps', default=999)
@click.option('--log10-transform', default=False)
@click.option('-p', '--processes', default=4)
def d3pendr(treatment_bams, control_bams, output_bed, annotation_bed12,
            min_read_overlap, bootstraps, log10_transform, processes):
    '''
    d3pendr: Differential 3' End analysis of Nanopore Direct RNAseq

    Identifies differential polyadenylation events in either an
    annotation dependent or (TODO) reference free manner, using
    a permutation test of Wasserstein distance between 3' end 
    distributions and a KS test.

    Outputs bed6 format with extra columns.
    '''
    if annotation_bed12 is None:
        results = annot_free_diff_tpe(
            treatment_bams, control_bams,
            min_read_overlap, bootstraps,
            log10_transform,
            processes
        )
    else:
        results = ref_guided_diff_tpe(
            annotation_bed12,
            treatment_bams, control_bams,
            min_read_overlap, bootstraps,
            log10_transform,
            processes
        )
    write_output_bed(output_bed, results)


if __name__ == '__main__':
    d3pendr()
 No newline at end of file