Commit ac18b0b9 authored by Weipeng Mo's avatar Weipeng Mo
Browse files

update snakefile for reproducibility.

parent 1aa89fac
Loading
Loading
Loading
Loading
+8 −1
Original line number Diff line number Diff line
@@ -21,6 +21,13 @@ This pipeline includes basecalling and preprocessing of the FLEP-seq data.
  * Tidyverse (https://www.tidyverse.org/)
  * optparse (https://cran.r-project.org/web/packages/optparse/index.html)

**One-step installation**

Install FLEP-seq pipeline, its dependencies, and other required packages in one step using conda or mamba:

```
mamba create -n flepseq minimap2 pysam click ont-fast5-api joblib r-tidyverse r-optparse samtools pandas
```

## Inputs

environment.yml

0 → 100644
+237 −0
Original line number Diff line number Diff line
name: flepseq
channels:
  - bioconda
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - _r-mutex=1.0.1=anacondar_1
  - binutils_impl_linux-64=2.40=hf600244_0
  - bwidget=1.9.14=ha770c72_1
  - bzip2=1.0.8=h7f98852_4
  - c-ares=1.19.0=h5eee18b_0
  - ca-certificates=2023.01.10=h06a4308_0
  - cached-property=1.5.2=hd8ed1ab_1
  - cached_property=1.5.2=pyha770c72_1
  - cairo=1.16.0=ha61ee94_1014
  - click=8.1.3=py310hff52083_1
  - curl=7.87.0=h6312ad2_0
  - expat=2.5.0=hcb278e6_1
  - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
  - font-ttf-inconsolata=3.000=h77eed37_0
  - font-ttf-source-code-pro=2.038=h77eed37_0
  - font-ttf-ubuntu=0.83=hab24e00_0
  - fontconfig=2.14.2=h14ed4e7_0
  - fonts-conda-ecosystem=1=0
  - fonts-conda-forge=1=0
  - freetype=2.12.1=hca18f0e_1
  - fribidi=1.0.10=h36c2ea0_0
  - gcc_impl_linux-64=12.2.0=hcc96c02_19
  - gettext=0.21.1=h27087fc_0
  - gfortran_impl_linux-64=12.2.0=h55be85b_19
  - graphite2=1.3.14=h295c915_1
  - gsl=2.7.1=h6e86dc7_1
  - gxx_impl_linux-64=12.2.0=hcc96c02_19
  - h5py=3.8.0=nompi_py310ha66b2ad_101
  - harfbuzz=6.0.0=h8e241bc_0
  - hdf5=1.14.0=nompi_h5231ba7_103
  - htslib=1.17=h6bc39ce_0
  - icu=70.1=h27087fc_0
  - joblib=1.2.0=pyhd8ed1ab_0
  - jpeg=9e=h0b41bf4_3
  - k8=0.2.5=hd03093a_2
  - kernel-headers_linux-64=2.6.32=he073ed8_15
  - keyutils=1.6.1=h166bdaf_0
  - krb5=1.20.1=hf9c8cef_0
  - ld_impl_linux-64=2.40=h41732ed_0
  - lerc=4.0.0=h27087fc_0
  - libaec=1.0.6=hcb278e6_1
  - libblas=3.9.0=16_linux64_openblas
  - libcblas=3.9.0=16_linux64_openblas
  - libcurl=7.87.0=h6312ad2_0
  - libdeflate=1.13=h166bdaf_0
  - libedit=3.1.20221030=h5eee18b_0
  - libev=4.33=h516909a_1
  - libexpat=2.5.0=hcb278e6_1
  - libffi=3.4.2=h7f98852_5
  - libgcc-devel_linux-64=12.2.0=h3b97bd3_19
  - libgcc-ng=12.2.0=h65d4601_19
  - libgfortran-ng=12.2.0=h69a702a_19
  - libgfortran5=12.2.0=h337968e_19
  - libglib=2.74.1=h606061b_1
  - libgomp=12.2.0=h65d4601_19
  - libiconv=1.17=h166bdaf_0
  - liblapack=3.9.0=16_linux64_openblas
  - libnghttp2=1.51.0=hdcd2b5c_0
  - libnsl=2.0.0=h7f98852_0
  - libopenblas=0.3.21=pthreads_h78a6416_3
  - libpng=1.6.39=h753d276_0
  - libsanitizer=12.2.0=h46fd767_19
  - libsqlite=3.40.0=h753d276_0
  - libssh2=1.10.0=haa6b8db_3
  - libstdcxx-devel_linux-64=12.2.0=h3b97bd3_19
  - libstdcxx-ng=12.2.0=h46fd767_19
  - libtiff=4.4.0=h0e0dad5_3
  - libuuid=2.38.1=h0b41bf4_0
  - libwebp-base=1.3.0=h0b41bf4_0
  - libxcb=1.13=h7f98852_1004
  - libxml2=2.10.3=hca2bb57_4
  - libzlib=1.2.13=h166bdaf_4
  - lz4-c=1.9.4=hcb278e6_0
  - make=4.3=hd18ef5c_1
  - minimap2=2.24=h7132678_1
  - ncurses=6.4=h6a678d5_0
  - numpy=1.24.2=py310h8deb116_0
  - ont-fast5-api=4.1.1=pyhdfd78af_0
  - openssl=1.1.1t=h0b41bf4_0
  - packaging=23.1=pyhd8ed1ab_0
  - pandas=2.0.0=py310h9b08913_0
  - pandoc=2.19.2=h32600fe_2
  - pango=1.50.14=hd33c08f_0
  - pcre2=10.40=hc3806b6_0
  - pip=23.0.1=pyhd8ed1ab_0
  - pixman=0.40.0=h36c2ea0_0
  - progressbar33=2.4=py_0
  - pthread-stubs=0.4=h36c2ea0_1001
  - pysam=0.21.0=py310hff46b53_0
  - python=3.10.8=h257c98d_0_cpython
  - python-dateutil=2.8.2=pyhd8ed1ab_0
  - python-tzdata=2023.3=pyhd8ed1ab_0
  - python_abi=3.10=3_cp310
  - pytz=2023.3=pyhd8ed1ab_0
  - r-askpass=1.1=r42h06615bd_3
  - r-assertthat=0.2.1=r42hc72bb7e_3
  - r-backports=1.4.1=r42h06615bd_1
  - r-base=4.2.2=h6b4767f_2
  - r-base64enc=0.1_3=r42h06615bd_1005
  - r-bit=4.0.5=r42h06615bd_0
  - r-bit64=4.0.5=r42h06615bd_1
  - r-blob=1.2.4=r42hc72bb7e_0
  - r-broom=1.0.4=r42hc72bb7e_0
  - r-bslib=0.4.2=r42hc72bb7e_0
  - r-cachem=1.0.7=r42h133d619_0
  - r-callr=3.7.3=r42hc72bb7e_0
  - r-cellranger=1.1.0=r42hc72bb7e_1005
  - r-cli=3.6.1=r42h38f115c_0
  - r-clipr=0.8.0=r42hc72bb7e_1
  - r-colorspace=2.1_0=r42h133d619_0
  - r-cpp11=0.4.3=r42hc72bb7e_0
  - r-crayon=1.5.2=r42hc72bb7e_1
  - r-curl=4.3.3=r42h06615bd_1
  - r-data.table=1.14.8=r42h133d619_0
  - r-dbi=1.1.3=r42hc72bb7e_1
  - r-dbplyr=2.3.2=r42hc72bb7e_0
  - r-digest=0.6.31=r42h38f115c_0
  - r-dplyr=1.1.1=r42h38f115c_0
  - r-dtplyr=1.3.1=r42hc72bb7e_0
  - r-ellipsis=0.3.2=r42h06615bd_1
  - r-evaluate=0.20=r42hc72bb7e_0
  - r-fansi=1.0.4=r42h133d619_0
  - r-farver=2.1.1=r42h7525677_1
  - r-fastmap=1.1.1=r42h38f115c_0
  - r-fontawesome=0.5.0=r42hc72bb7e_0
  - r-forcats=1.0.0=r42hc72bb7e_0
  - r-fs=1.6.1=r42h38f115c_0
  - r-gargle=1.3.0=r42h785f33e_0
  - r-generics=0.1.3=r42hc72bb7e_1
  - r-getopt=1.20.3=r42ha770c72_3
  - r-ggplot2=3.4.2=r42hc72bb7e_0
  - r-glue=1.6.2=r42h06615bd_1
  - r-googledrive=2.1.0=r42hc72bb7e_0
  - r-googlesheets4=1.1.0=r42h785f33e_0
  - r-gtable=0.3.3=r42hc72bb7e_0
  - r-haven=2.5.2=r42h38f115c_0
  - r-highr=0.10=r42hc72bb7e_0
  - r-hms=1.1.3=r42hc72bb7e_0
  - r-htmltools=0.5.5=r42h38f115c_0
  - r-httr=1.4.5=r42hc72bb7e_0
  - r-ids=1.0.1=r42hc72bb7e_2
  - r-isoband=0.2.7=r42h38f115c_1
  - r-jquerylib=0.1.4=r42hc72bb7e_1
  - r-jsonlite=1.8.4=r42h133d619_0
  - r-knitr=1.42=r42hc72bb7e_1
  - r-labeling=0.4.2=r42hc72bb7e_2
  - r-lattice=0.21_8=r42h133d619_0
  - r-lifecycle=1.0.3=r42hc72bb7e_1
  - r-lubridate=1.9.2=r42h133d619_1
  - r-magrittr=2.0.3=r42h06615bd_1
  - r-mass=7.3_58.3=r42h133d619_0
  - r-matrix=1.5_4=r42he1ae0d6_0
  - r-memoise=2.0.1=r42hc72bb7e_1
  - r-mgcv=1.8_42=r42he1ae0d6_0
  - r-mime=0.12=r42h06615bd_1
  - r-modelr=0.1.11=r42hc72bb7e_0
  - r-munsell=0.5.0=r42hc72bb7e_1005
  - r-nlme=3.1_162=r42hac0b197_0
  - r-openssl=2.0.5=r42hb1dc35e_0
  - r-optparse=1.7.3=r42hc72bb7e_1
  - r-pillar=1.9.0=r42hc72bb7e_0
  - r-pkgconfig=2.0.3=r42hc72bb7e_2
  - r-prettyunits=1.1.1=r42hc72bb7e_2
  - r-processx=3.8.0=r42h06615bd_0
  - r-progress=1.2.2=r42hc72bb7e_3
  - r-ps=1.7.4=r42h133d619_0
  - r-purrr=1.0.1=r42h133d619_0
  - r-r6=2.5.1=r42hc72bb7e_1
  - r-rappdirs=0.3.3=r42h06615bd_1
  - r-rcolorbrewer=1.1_3=r42h785f33e_1
  - r-rcpp=1.0.10=r42h38f115c_0
  - r-readr=2.1.4=r42h38f115c_0
  - r-readxl=1.4.2=r42h81ef4d7_0
  - r-rematch=1.0.1=r42hc72bb7e_1005
  - r-rematch2=2.1.2=r42hc72bb7e_2
  - r-reprex=2.0.2=r42hc72bb7e_1
  - r-rlang=1.1.0=r42h38f115c_0
  - r-rmarkdown=2.21=r42hc72bb7e_0
  - r-rstudioapi=0.14=r42hc72bb7e_1
  - r-rvest=1.0.3=r42hc72bb7e_1
  - r-sass=0.4.5=r42h38f115c_0
  - r-scales=1.2.1=r42hc72bb7e_1
  - r-selectr=0.4_2=r42hc72bb7e_2
  - r-stringi=1.7.12=r42h1ae9187_0
  - r-stringr=1.5.0=r42h785f33e_0
  - r-sys=3.4.1=r42h06615bd_0
  - r-tibble=3.2.1=r42h133d619_1
  - r-tidyr=1.3.0=r42h38f115c_0
  - r-tidyselect=1.2.0=r42hc72bb7e_0
  - r-tidyverse=1.3.2=r42hc72bb7e_1
  - r-timechange=0.2.0=r42h38f115c_0
  - r-tinytex=0.44=r42hc72bb7e_0
  - r-tzdb=0.3.0=r42h7525677_1
  - r-utf8=1.2.3=r42h133d619_0
  - r-uuid=1.1_0=r42h06615bd_1
  - r-vctrs=0.6.1=r42h38f115c_0
  - r-viridislite=0.4.1=r42hc72bb7e_1
  - r-vroom=1.6.1=r42h38f115c_0
  - r-withr=2.5.0=r42hc72bb7e_1
  - r-xfun=0.38=r42h38f115c_0
  - r-xml2=1.3.3=r42h044e5c7_2
  - r-yaml=2.3.7=r42h133d619_0
  - readline=8.2=h8228510_1
  - samtools=1.17=h00cdaf9_0
  - sed=4.8=he412f7d_0
  - setuptools=67.6.1=pyhd8ed1ab_0
  - six=1.16.0=pyh6c4a22f_0
  - sysroot_linux-64=2.12=he073ed8_15
  - tar=1.34=hb2e2bae_1
  - tk=8.6.12=h27826a3_0
  - tktable=2.10=hb7b940f_3
  - tzdata=2023c=h71feb2d_0
  - wheel=0.40.0=pyhd8ed1ab_0
  - xorg-kbproto=1.0.7=h7f98852_1002
  - xorg-libice=1.0.10=h7f98852_0
  - xorg-libsm=1.2.3=hd9c2040_1000
  - xorg-libx11=1.8.4=h0b41bf4_0
  - xorg-libxau=1.0.9=h7f98852_0
  - xorg-libxdmcp=1.1.3=h7f98852_0
  - xorg-libxext=1.3.4=h0b41bf4_2
  - xorg-libxrender=0.9.10=h7f98852_1003
  - xorg-libxt=1.2.1=h7f98852_2
  - xorg-renderproto=0.11.1=h7f98852_1002
  - xorg-xextproto=7.3.0=h0b41bf4_1003
  - xorg-xproto=7.0.31=h7f98852_1007
  - xz=5.2.10=h5eee18b_1
  - zlib=1.2.13=h166bdaf_4
  - zstd=1.5.5=hc292b87_0
prefix: /work/bio-mowp/anaconda3/envs/flepseq
+4 −1
Original line number Diff line number Diff line
minimap2
pysam
numpy
pandas
samtools
click
ont-fast5-api
joblib

run_guppy.sh

0 → 100644
+19 −0
Original line number Diff line number Diff line
#BSUB -J 01_guppy
#BSUB -n 20
#BSUB -o %J.stdout
#BSUB -e %J.stderr
#BSUB -R span[hosts=1]
#BSUB -q gpu


export PATH=/work/bio-mowp/software/ont-guppy-4.0.11/bin:$PATH

guppy_basecaller \
  -i fast5 \
  -s guppy_out \
  -c dna_r9.4.1_450bps_hac.cfg \
  --recursive \
  --disable_pings \
  --fast5_out \
  --qscore_filtering \
  --device "cuda:all:100%"
+9 −12
Original line number Diff line number Diff line
'''
Author:
    windz
Usage:
    snakemake -j 15 --cluster 'bsub -J {rulename} -n {threads} -o log/%J.stdout -e log/%J.stderr -R span[hosts=1]' -s flepseq.smk
'''

# create log dir
if not os.path.exists('log'):
    os.mkdir('log')


configfile: 'config.yml'
sample=config['sample']
genome_data=config['genome']
@@ -28,6 +21,7 @@ rule fastq_to_fasta:
    output:
        'basecalled_data/{sample}.fasta'
    threads: 1
    conda: 'flepseq'
    shell:
        '''
python script/fastqdir2fasta.py --indir {input} --out {output}
@@ -42,6 +36,7 @@ rule mapping_to_genome:
    params:
        genome=genome_data
    threads: 36
    conda: 'flepseq'
    shell:
        '''
minimap2 -t {threads} -ax splice --secondary=no -G 12000 {params.genome} {input} | samtools sort -@ {threads} -o {output.bam} -
@@ -56,6 +51,7 @@ rule find_3linker:
    output:
        'aligned_data/{sample}.adapter.result.txt'
    threads: 36
    conda: 'flepseq'
    shell:
        '''
python script/adapterFinder.py --inbam {input.bam} --inseq {input.fasta} --out {output} --threads {threads} --mode 1
@@ -70,6 +66,7 @@ rule polyacaller:
    output:
        'aligned_data/{sample}.polyA_tail.result.txt'
    threads: 36
    conda: 'flepseq'
    shell:
        '''
python script/PolyACaller.py --inadapter {input.adapter_result} --summary {input.sequencing_summary}  --fast5dir {input.fast5_dir} --out {output} --threads {threads}
@@ -84,6 +81,7 @@ rule identify_read_info:
    params:
        'genome_data/exon_intron_pos.repr.bed'
    threads: 1
    conda: 'flepseq'
    shell:
        '''
python script/extract_read_info.py --inbed {params} --inbam {input} --out {output}
@@ -100,9 +98,9 @@ rule merge_info:
    threads: 1
    params:
        'Nanopore'
    conda: 'flepseq'
    shell:
        '''
export PATH=/public/home/mowp/anaconda3/envs/R/bin:$PATH
Rscript script/merge_read_info.R --type {params} --inreadinfo {input.read_info} --inadapter {input.adapter} --inpolya {input.polya} --out {output.read_info_result}
        '''

@@ -118,10 +116,10 @@ rule splicing_kinetics:
    params:
        inbed='genome_data/exon_intron_pos.repr.bed',
        select_intron='genome_data/select_introns.txt'
    conda: 'flepseq'
    shell:
        '''
python script/prepare_data_for_splice_kinetics.py --inreadinfo {input.read_info} --inbed {params.inbed} --out {output.splicing_data}
export PATH=/public/home/mowp/anaconda3/envs/R/bin:$PATH
Rscript script/plot_intron_splicing_kinetics.R --inrelpos {output.splicing_data} --inreadinfo {input.read_info} --inintron {params.select_intron} --out {output.splicing_kinetics} --pdf {output.figure}
        '''

@@ -135,9 +133,8 @@ rule intron_retention_ratio:
        rna_ir='results/{sample}.read.rna.ir.stat',
        intron_ir='results/{sample}.read.intron.ir.stat',
    threads: 1
    conda: 'flepseq'
    shell:
        '''
export PATH=/public/home/mowp/anaconda3/envs/R/bin:$PATH
Rscript script/cal_polya_transcript_ir.R --inrelpos {input.splicing_data} --inreadinfo {input.read_info} --outrna {output.rna_ir} --outintron {output.intron_ir}
        '''