Commit e01aaaeb authored by Weipeng Mo's avatar Weipeng Mo
Browse files

Add snakefile

parent 140882de
Loading
Loading
Loading
Loading

basecalling.snakefile

0 → 100755
+51 −0
Original line number Diff line number Diff line
'''
@Author       : windz
@Date         : 2020-05-08 09:51:48
@LastEditTime : 2020-05-08 11:26:12
@Description  : A pipline for calling polya tail length

snakemake -j 15 -c 'bsub -J {rulename} -n {threads} -gpu "num=2" -o %J.stdout -e %J.stderr' -s snakefile_for_bascalled

'''

import os
import glob


DIR_NAME = [os.path.basename(fn).split('.')[0] for fn in glob.glob('fast5/*.sm')]
SAMPLE_NAME = [os.getcwd().split('/')[-1]]

rule all:
    input:
        expand('guppy_out/{dir_name}/sequencing_summary.txt', dir_name=DIR_NAME),
        'fast5/split_files_into_dirs.log',
        #expand('{sample_name}.fastq.gz', sample_name = SAMPLE_NAME)


rule split_fast5_files:
    input:
        'script/split_files_into_dirs.py'
    output:
        'fast5/split_files_into_dirs.log'
    threads: 1
    shell:
        '''
python script/split_files_into_dirs.py --dir_path fast5/ --suffix fast5  --split_num 15
        '''


rule run_guppy:
    input:
        'fast5/split_files_into_dirs.log',  # log文件,只是用来等上一步完成
        'fast5/{dir_name}.sm'  # 空文件,guppy输入是目录
    output:
        'guppy_out/{dir_name}/sequencing_summary.txt'
    threads: 8
    params:
        dir='{dir_name}',
        out='guppy_out/{dir_name}/',
        gpu=2
    shell:
        '''
guppy_basecaller -i fast5/{params.dir} -s {params.out} -c dna_r9.4.1_450bps_hac.cfg --recursive --fast5_out --disable_pings --qscore_filtering --device "cuda:all:100%"
        '''

snakefile

0 → 100755
+146 −0
Original line number Diff line number Diff line
'''
@Author       : windz
@Date         : 2020-05-08 09:51:48
LastEditTime : 2020-12-07 09:51:37
@Description  : A pipline for calling polya tail length

    You can run like this:
    snakemake -j 8 -p -c 'bsub -J {rulename} -n {threads} -o log/%J.stdout -e log/%J.stderr'
'''


# create log dir
if not os.path.exists('log'):
    os.mkdir('log')


configfile: 'config.yml'
SAMPLE_NAME=config['sample_name']


rule all:
    input:
        expand('results/{sample_name}.read.info.txt', sample_name=SAMPLE_NAME),
        expand('results/{sample_name}.read.splicing_kinetics.txt', sample_name=SAMPLE_NAME),
        expand('results/{sample_name}.read.rna.ir.stat', sample_name=SAMPLE_NAME),


rule fastq_to_fasta:
    input:
        'guppy_out/pass/',
    output:
        'basecalled_data/{sample_name}.fasta'
    threads: 1
    shell:
        '''
python script/fastqdir2fasta.py --indir {input} --out {output}
        '''


rule mapping_to_genome:
    input:
        'basecalled_data/{sample_name}.fasta'
    output:
        bam='aligned_data/{sample_name}.sorted.bam'
    params:
        genome='/scem/work/mowp/db/Arabidopsis_thaliana/dna/genome.fasta'
    threads: 32
    shell:
        '''
minimap2 -t {threads} -ax splice --secondary=no -G 12000 {params.genome} {input} | samtools sort -@ {threads} -o {output.bam} -
samtools index -@ {threads} {output.bam}
        '''


rule find_3linker:
    input:
        bam='aligned_data/{sample_name}.sorted.bam',
        fasta='basecalled_data/{sample_name}.fasta'
    output:
        'aligned_data/{sample_name}.adapter.result.txt'
    threads: 36
    shell:
        '''
python script/adapterFinder.py --inbam {input.bam} --inseq {input.fasta} --out {output} --threads {threads} --mode 1
        '''


rule polyacaller:
    input:
        adapter_result='aligned_data/{sample_name}.adapter.result.txt',
        sequencing_summary='sequencing_summary.txt',
        fast5_dir='guppy_out/workspace',
    output:
        'aligned_data/{sample_name}.polyA_tail.result.txt'
    threads: 36
    shell:
        '''
python script/PolyACaller.py --inadapter {input.adapter_result} --summary {input.sequencing_summary}  --fast5dir {input.fast5_dir} --out {output} --threads {threads}
        '''


rule identify_read_info:
    input:
        'aligned_data/{sample_name}.sorted.bam'
    output:
        'aligned_data/{sample_name}.read_info.result.txt'
    params:
        'genome_data/exon_intron_pos.repr.bed'
    threads: 1
    shell:
        '''
python script/extract_read_info.py --inbed {params} --inbam {input} --out {output}
        '''


rule merge_info:
    input:
        read_info='aligned_data/{sample_name}.read_info.result.txt',
        adapter='aligned_data/{sample_name}.adapter.result.txt',
        polya='aligned_data/{sample_name}.polyA_tail.result.txt'
    output:
        read_info_result='results/{sample_name}.read.info.txt'
    threads: 1
    params:
        'Nanopore'
    shell:
        '''
export PATH=/scem/work/mowp/miniconda3/envs/R/bin/:$PATH
Rscript script/merge_read_info.R --type {params} --inreadinfo {input.read_info} --inadapter {input.adapter} --inpolya {input.polya} --out {output.read_info_result}
        '''


rule splicing_kinetics:
    input:
        read_info='results/{sample_name}.read.info.txt'
    output:
        splicing_data='results/{sample_name}.read.intron.pos.splicing.txt',
        splicing_kinetics='results/{sample_name}.read.splicing_kinetics.txt',
        figure='results/{sample_name}.read.splicing_kinetics.pdf'
    threads: 1
    params:
        inbed='genome_data/exon_intron_pos.repr.bed',
        select_intron='genome_data/select_introns.txt'
    shell:
        '''
python script/prepare_data_for_splice_kinetics.py --inreadinfo {input.read_info} --inbed {params.inbed} --out {output.splicing_data}
export PATH=/scem/work/mowp/miniconda3/envs/R/bin/:$PATH
Rscript script/plot_intron_splicing_kinetics.R --inrelpos {output.splicing_data} --inreadinfo {input.read_info} --inintron {params.select_intron} --out {output.splicing_kinetics} --pdf {output.figure}
        '''


# Calculate intron retention ratio of polyadenylated transcripts
rule intron_retention_ratio:
    input:
        splicing_data='results/{sample_name}.read.intron.pos.splicing.txt',
        read_info='results/{sample_name}.read.info.txt',
    output:
        rna_ir='results/{sample_name}.read.rna.ir.stat',
        intron_ir='results/{sample_name}.read.intron.ir.stat',
    threads: 1
    shell:
        '''
export PATH=/scem/work/mowp/miniconda3/envs/R/bin/:$PATH
Rscript script/cal_polya_transcript_ir.R --inrelpos {input.splicing_data} --inreadinfo {input.read_info} --outrna {output.rna_ir} --outintron {output.intron_ir}
        '''