Unverified Commit a78a9653 authored by clintmil's avatar clintmil Committed by GitHub
Browse files

Merge branch 'Tarela:main' into main

parents db5b3c85 3385b5d1
Loading
Loading
Loading
Loading
+154 −0
Original line number Diff line number Diff line
#!/bin/bash
export RASQUALDIR=/apps/software/standard/compiler/gcc/9.2.0/rasqual/20191124
printf 'RDIR=%s' $RASQUALDIR > /dev/stderr

if [ ${#@} -le 3 ] ; then
    printf '\nUsage: %s [paired_end|single_end] bam_list_file vcf_input_file vcf_output_file assay_type\n' "${0}" > /dev/stderr;
    printf '\n       assay_type can be \"atac\", \"rna\", or alternatively not specified (general).\n\n' > /dev/stderr;
    exit 1;
fi


PAIRED_OR_SINGLE_END="${1}";
BAM_LIST_FILENAME="${2}";
VCF_INPUT_FILENAME="${3}";
VCF_OUTPUT_FILENAME="${4}";
ASSAY_TYPE="${5}"
SINGLE_END="";

if [ "${PAIRED_OR_SINGLE_END}" = "single_end" ] ; then
    SINGLE_END="-singleEnd";
elif [ "${PAIRED_OR_SINGLE_END}" = "paired_end" ] ; then
    SINGLE_END="";
else
    printf 'Error: Specify read type: "single_end" or "paired_end"\n' > /dev/stderr;
    exit 1;
fi

MIN_INSERT=1
MAX_INSERT=270000000
if [ "${ASSAY_TYPE}" = "atac" ] ; then
    printf 'Assay type : ATAC-seq\n' > /dev/stderr;
    MIN_INSERT=37;
    MAX_INSERT=10000;
elif [ "${ASSAY_TYPE}" = "rna" ] ; then
    printf 'Assay type : RNA-seq\n' > /dev/stderr;
    MIN_INSERT=37;
    MAX_INSERT=2473537
fi

if [ ! -f "${BAM_LIST_FILENAME}" ] ; then
    printf 'Error: BAM list file "%s" does not exist.\n' "${BAM_LIST_FILENAME}" > /dev/stderr;
    exit 1;
fi

if [ ! -f "${VCF_INPUT_FILENAME}" ] ; then
    printf 'Error: VCF file "%s" does not exist.\n' "${VCF_INPUT_FILENAME}" > /dev/stderr;
    exit 1;
fi

if [ "${VCF_INPUT_FILENAME}" = "${VCF_INPUT_FILENAME%.vcf.gz}" ] ; then
    printf 'Error: VCF file "%s" must end with ".vcf.gz" (created with bgzip).\n' "${VCF_INPUT_FILENAME}" > /dev/stderr;
    exit 1;
fi


if [ -f "${VCF_OUTPUT_FILENAME}" ] ; then
    printf 'Error: VCF output file "%s" does already exist.\n' "${VCF_OUTPUT_FILENAME}" > /dev/stderr;
    exit 1;
fi


# Set ASVCF specific temp directory if the default temp dir is not big enough.
ASVCF_TMP_DIR="";
# Create directory for temporary files.
#   - if ${ASVCF_TMP_DIR} variable is not empty, create a subdir in that directory.
#   - else create a subdirectory in ${TMPDIR}.
#   - else if ${TMPDIR} is not set, create a subdirectory in /tmp.
ASVCF_CURRENT_TMP_DIR=$(mktemp -p "${ASVCF_TMP_DIR:=${TMPDIR}}" -d "ASVCF_${VCF_FILENAME##*/}_XXXXXX");

if [ ! -d "${ASVCF_CURRENT_TMP_DIR}" ] ; then
    printf 'Error: Unable to create temp directory "%s".\n' "${ASVCF_CURRENT_TMP_DIR}" > /dev/stderr;
    exit 1;
fi


ALL_ASC_FILENAME="${ASVCF_CURRENT_TMP_DIR}/all_asc.as.gz";


# Indexed array to keep track of the order of the BAM filenames as specified in ${BAM_LIST_FILENAME}.
declare -a BAM_FILES
# Indexed array to keep track of the order of the AS filenames.
declare -a AS_FILES

for BAM_FILENAME in $(cat "${BAM_LIST_FILENAME}") ; do
    if [ ! -f "${BAM_FILENAME}" ] ; then
        printf 'Error: BAM file "%s" does not exist.\n' "${BAM_FILENAME}" > /dev/stderr;
        exit 1;
    fi

    # Get the sample name by taking the basename of the BAM file withotu the ".bam" extension.
    SAMPLE_NAME="${BAM_FILENAME##*/}";
    SAMPLE_NAME="${SAMPLE_NAME%.bam}";

    AS_FILENAME="${ASVCF_CURRENT_TMP_DIR}/${SAMPLE_NAME}.as.gz";

    BAM_FILES+=("${BAM_FILENAME}");
    AS_FILES+=("${AS_FILENAME}");
done


# Count AS reads.
for CHR in $(tabix -l "${VCF_INPUT_FILENAME}") ; do
    # Create a temporary VCF file for the current chromosome only.
    TMP_VCF_FILENAME="${ASVCF_CURRENT_TMP_DIR}/${VCF_INPUT_FILENAME##*/}"

    tabix "${VCF_INPUT_FILENAME}" "${CHR}" \
        | cut -f 1-9 \
        | gzip \
        > "${TMP_VCF_FILENAME}";

    START=$(zcat "${TMP_VCF_FILENAME}" | head -n 1 | cut -f 2);
    END=$(zcat "${TMP_VCF_FILENAME}" | tail -n 1 | cut -f 2);

    echo "${CHR}:${START}-${END}";

    # Loop over each BAM file.
    for BAM_FILENAME_IDX in "${!BAM_FILES[@]}" ; do
        BAM_FILENAME="${BAM_FILES[${BAM_FILENAME_IDX}]}";
        AS_FILENAME="${AS_FILES[${BAM_FILENAME_IDX}]}";

        # Filter out secondary alignments from the BAM file.
        samtools view -F 0x0100 "${BAM_FILENAME}" "${CHR}:${START}-${END}" \
            | awk '$7=="="{print}' \
            | sort -k 1 \
            | "${RASQUALDIR}/src/ASVCF/qcFilterBam" stdin -skipMissing F -maxMismatch 3 -maxGapOpen 0 -maxBestHit 1 -minQual 10 -minInsert "${MIN_INSERT}" -maxInsert "${MAX_INSERT}" "${PAIRED_END}" \
            | sort -k 2 -n \
            | "${RASQUALDIR}/src/ASVCF/countAS" "${TMP_VCF_FILENAME}" \
            | awk -F '\t' '{ print $5 "," $6; }' \
            | gzip \
            >> "${AS_FILENAME}";
    done

    # Remove temporary VCF file.
    rm "${TMP_VCF_FILENAME}";
done


# Paste all AS counts in one file.
"${RASQUALDIR}/src/ASVCF/zpaste" "${AS_FILES[@]}" > "${ALL_ASC_FILENAME}";


# Remove all temporary AS files.
rm "${AS_FILES[@]}";


# merge AS counts and VCF.
"${RASQUALDIR}/src/ASVCF/pasteFiles" "${VCF_INPUT_FILENAME}" "${ALL_ASC_FILENAME}" \
    | bgzip \
    > "${VCF_OUTPUT_FILENAME}";


# Remove temporary files and directories.
rm "${ALL_ASC_FILENAME}";
rmdir "${ASVCF_CURRENT_TMP_DIR}";
+20 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH -N 1
#SBATCH --cpus-per-task=10
#SBATCH --mem=80000
#SBATCH --time=1-00:00:00
#SBATCH --partition=standard
#SBATCH --account=cphg-millerlab
module load gcc/9.2.0
module load gsl/2.4
module load rasqual
module load tabix
module load gparallel/20170822 
n_lines=$(wc -l bulk_35_input/rasqual.Bulk.35.input.txt | cut -d" " -f1)
parallel -j10 bash rasqual.perm_bulk_replication.sh \
bulk_35_input/rasqual.Bulk.35.input.txt {} \
bulk_35_input/Y.bin \
bulk_35_input/K.bin \
bulk_35_input/X.bin \
asvcf/bulk_coronary_ATAC_AS_subset35.vcf.gz \
bulk_35_output_perm1 ::: `seq $n_lines`
+43 −0
Original line number Diff line number Diff line
# run RASQUAL
# bash rasqual.sh <input parameter file> <line number> Y.bin K.bin X.bin VCF
param_file=$1
line_num=$2
Y=$3
K=$4
X=$5
vcf_file=$6
out_dir=$7
mkdir -p $out_dir

param=($(cat $1 | sed "${line_num}q;d"))
gene_id=${param[0]}
gene_name=${param[1]}
region=${param[2]}
n_rsnp=${param[3]}
n_fsnp=${param[4]}
exon_start_positions=${param[5]}
exon_end_positions=${param[6]}
feat_id=$(grep $gene_id -n bulk_35_input/Y.txt | cut -d":" -f1,1)
window_size=20402
n_sample=35
echo id: $gene_id 
echo name: $gene_name 
echo region: $region
echo reference snps: $n_rsnp
echo feature snps: $n_fsnp
echo feature id: $feat_id

if [[ -e $out_dir/${gene_id}_${gene_name}.txt ]]; then 
	echo $out_dir/${gene_id}_${gene_name}.txt exist! skipping...
else  
	tabix $vcf_file $region | \
	rasqual \
	-y $Y -k $K -x $X \
	-n $n_sample -j $feat_id -l $n_rsnp -m $n_fsnp \
    -s $exon_start_positions -e $exon_end_positions \
    --imputation-quality 0.8 --imputation-quality-fsnp 0.8 \
    --cis-window-size $window_size \
    -f $gene_name --n_threads 10 \
    --random-permutation -t -p 5 \
    --force -v > $out_dir/${gene_id}_${gene_name}.txt
fi 
+20 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH -N 1
#SBATCH --cpus-per-task=10
#SBATCH --mem=80000
#SBATCH --time=1-0:00:00
#SBATCH --partition=standard
#SBATCH --account=cphg-millerlab-VIP
module load gcc/9.2.0
module load gsl/2.4
module load rasqual
module load tabix
module load gparallel/20170822 
n_lines=$(wc -l bulk_35_input/rasqual.Bulk.35.input.txt | cut -d" " -f1)
parallel -j10 bash rasqual_bulk_replication.sh \
bulk_35_input/rasqual.Bulk.35.input.txt {} \
bulk_35_input/Y.bin \
bulk_35_input/K.bin \
bulk_35_input/X.bin \
asvcf/bulk_coronary_ATAC_AS_subset35.vcf.gz \
bulk_35_output ::: `seq $n_lines`
+13 −0
Original line number Diff line number Diff line
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=20
#SBATCH --mem=60000
#SBATCH --time=3-0:00:00
#SBATCH --partition=standard
#SBATCH --account=cphg-millerlab
module load gcc/9.2.0
module load samtools
module load tabix
module load rasqual
bash ./createASVCF.sh paired_end bam.list.bulk_2.txt UVA_hg38_coronary_bulk_ATAC_2_filtered.vcf.gz bulk_coronary_ATAC_AS_subset35.vcf.gz atac
Loading