#PBS -S /bin/bash
#PBS -q cal-s
#PBS -l nodes=1:ppn=24
#PBS -m a
#PBS -j oe
#PBS -N example

###======================environment=======================================================
# Location of this experiment and of the shared pipeline helper files.
work_folder="/scratch/2020-07-28/chaos"
experiment_name="Casodex0729"
experiment_folder="${work_folder}/${experiment_name}"
hic_essential_files="/home/bio-fangc/local/juicer/hic_essential_files"

#######################################
# Escape a string so it is taken literally as the replacement part of a
# sed 's|pattern|replacement|' command.
# Arguments: $1 - raw text
# Outputs:   the text with backslash, '&' and '|' backslash-escaped (stdout)
#######################################
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

#######################################
# Copy the R environment template and fill in its two placeholders.
# Arguments: $1 - template file
#            $2 - output file (overwritten)
#            $3 - value for change_this_to_your_work_folder
#            $4 - value for change_this_to_your_experiment_name
# Returns:   non-zero if the template cannot be copied or sed fails
#######################################
generate_r_environment() {
  local template=$1 output=$2
  local work_dir_escaped experiment_escaped
  work_dir_escaped=$(sed_escape_replacement "$3") || return 1
  experiment_escaped=$(sed_escape_replacement "$4") || return 1

  # Without a fresh copy there is nothing to substitute into; do not touch
  # whatever file may be left over from an earlier run.
  if ! cp -- "${template}" "${output}"; then
    echo "ERROR: cannot copy ${template} to ${output}" >&2
    return 1
  fi

  # '|' is the delimiter so that the slashes in paths need no escaping; the
  # values come from the variables above instead of a hand-escaped copy.
  sed -i \
    -e "s|change_this_to_your_work_folder|${work_dir_escaped}|g" \
    -e "s|change_this_to_your_experiment_name|${experiment_escaped}|g" \
    "${output}"
}

generate_r_environment \
  "${hic_essential_files}/environment_for_hic.R_bac" \
  "${hic_essential_files}/environment_for_hic.R" \
  "${work_folder}" "${experiment_name}" \
  || echo "WARNING: ${hic_essential_files}/environment_for_hic.R was not regenerated" >&2

# --- External tools --------------------------------------------------------
juicer_home="/home/bio-fangc/local/juicer"
juicer="${juicer_home}/juicer.sh"
juicer_tools="${juicer_home}/common/juicer_tools"
straw="${juicer_home}/straw"
bedtools="/home/bio-fangc/local/bedtools-2.29.2/bin/bedtools"

# --- hg19 reference data ---------------------------------------------------
hg19_dir="/home/bio-fangc/Data/Reference/UCSC_hg19"
genome_size_file="${hg19_dir}/hg19.size"
Digest_file="${hg19_dir}/hg19_DpnII.txt"
Reference="${hg19_dir}/hg19"
genedensity_folder="${hg19_dir}/genedensity"

# --- Per-experiment sample lists -------------------------------------------
for list_name in sample_all_list sample_without_rep_list; do
  printf -v "${list_name}" '%s/%s' "${experiment_folder}" "${list_name}"
done

# --- Shared parameter lists ------------------------------------------------
# Each variable is named after the file it points to inside hic_essential_files.
for list_name in \
  resolution_list_all \
  resolution_list_tad \
  resolution_list_eigenvector \
  chr_list \
  overlap_ratio_list \
  overlap_ratio_list_for_plot; do
  printf -v "${list_name}" '%s/%s' "${hic_essential_files}" "${list_name}"
done
unset list_name

# Output directory layout; everything lives below ${experiment_folder}/data_analysis.
analysis_folder="${experiment_folder}/data_analysis"
hic_folder="${analysis_folder}/hic_files"
cool_folder="${analysis_folder}/cool_files"
h5_folder="${analysis_folder}/h5_files"
correlation_folder="${analysis_folder}/correlation"
hicexplorer_tad_folder="${analysis_folder}/TAD_3dgenome"
hicexplorer_tad_data_folder="${hicexplorer_tad_folder}/data"
hicexplorer_tad_overlap_folder="${hicexplorer_tad_folder}/TAD_overlap"
hicexplorer_tad_overlap_plot_folder="${hicexplorer_tad_folder}/TAD_overlap_plot"
arrowhead_tad_folder="${analysis_folder}/TAD_arrowhead"
# A shell variable name must not begin with a digit: the former name
# "3dgenome_loop_folder" was never assigned and expanding it is a bad
# substitution. The commented-out hicDetectLoops section further down still
# uses the old name and has to be switched to loop_3dgenome_folder before it
# is re-enabled.
loop_3dgenome_folder="${analysis_folder}/Loop_3dgenome"
hiccups_loop_folder="${analysis_folder}/Loop_hiccups"
eigenvector_folder="${analysis_folder}/Eigenvector_KR"
eigenvector_corrected_folder="${analysis_folder}/Eigenvector_KR_corrected"
eigenvector_plot_folder="${eigenvector_corrected_folder}/plot"
DistVsCounts_folder="${analysis_folder}/DistVsCounts"
SVL_folder="${analysis_folder}/SVL"
Whole_Chromosome_Positioning_Analysis_folder="${analysis_folder}/Whole_Chromosome_Positioning_Analysis"
inter_matrix_NONE_folder="${Whole_Chromosome_Positioning_Analysis_folder}/inter_matrix_NONE"
intra_matrix_NONE_folder="${Whole_Chromosome_Positioning_Analysis_folder}/intra_matrix_NONE"

# Directories created up front (same set as before).
# NOTE(review): hicexplorer_tad_overlap_plot_folder and eigenvector_plot_folder
# are defined above but were never created here - confirm whether the R
# scripts create them themselves.
analysis_dirs=(
  "${analysis_folder}"
  "${hic_folder}"
  "${correlation_folder}"
  "${cool_folder}"
  "${h5_folder}"
  "${hicexplorer_tad_folder}"
  "${hicexplorer_tad_data_folder}"
  "${hicexplorer_tad_overlap_folder}"
  "${arrowhead_tad_folder}"
  "${eigenvector_folder}"
  "${eigenvector_corrected_folder}"
  "${loop_3dgenome_folder}"
  "${hiccups_loop_folder}"
  "${DistVsCounts_folder}"
  "${SVL_folder}"
  "${Whole_Chromosome_Positioning_Analysis_folder}"
  "${inter_matrix_NONE_folder}"
  "${intra_matrix_NONE_folder}"
)

# PCA_folder is not assigned anywhere in the visible part of this script; a
# bare "mkdir" with no operand only produced an error. Create it only when a
# value has been provided.
if [[ -n "${PCA_folder:-}" ]]; then
  analysis_dirs+=("${PCA_folder}")
else
  echo "WARNING: PCA_folder is not defined; no PCA directory created" >&2
fi

if [[ -z "${experiment_folder:-}" ]]; then
  # Every path above would collapse to the filesystem root.
  echo "ERROR: experiment_folder is empty; not creating analysis directories" >&2
else
  # -p makes re-submitting the job harmless when the directories already exist.
  mkdir -p -- "${analysis_dirs[@]}"
fi


# #=======================make .hic files with juicer-gtz================================== 
# fastq.gz files must be decompressed first.
# juicer should usually be run on a cluster.
# The default restriction enzyme is DpnII; use -s to select a different one, e.g. -s MboI
# cat $sample_all_list | while read i
# do
# 	cp ${hic_essential_files}/juicer_pbs_example.sh ${experiment_folder}/${i}_juicer.sh
# 	sed -i "s/example/$i/g" ${hic_essential_files}/${i}_juicer.sh
# 	qsub ${experiment_folder}/${i}_juicer.sh
# done 
 #######################################
 # Prefix every entry of a juicer "aligned" directory with "<sample>_".
 # Entries whose name already starts with "<sample>_" are left alone, so
 # running the job a second time does not produce "<sample>_<sample>_..."
 # names (a juicer output that natively starts with "<sample>_" would be
 # skipped as well).
 # Arguments: $1 - aligned directory
 #            $2 - sample name
 #######################################
 prefix_aligned_files() {
   local aligned_dir=$1 sample=$2
   local path name
   for path in "${aligned_dir}"/*; do
     # An unmatched glob stays literal; skip it.
     [[ -e "${path}" ]] || continue
     name=${path##*/}
     [[ "${name}" == "${sample}_"* ]] && continue
     mv -- "${path}" "${aligned_dir}/${sample}_${name}"
   done
 }

 # For each sample: drop juicer's splits directory, prefix the aligned files
 # with the sample name and copy the inter.hic map into the analysis folder.
 if [[ ! -r "${sample_all_list:-}" ]]; then
   echo "ERROR: sample list '${sample_all_list:-}' is missing or unreadable" >&2
 else
   # The list is read on fd 3 so commands in the loop body cannot consume it
   # through stdin.
   while read -r -u 3 sample || [[ -n "${sample}" ]]; do
     [[ -n "${sample}" ]] || continue
     sample_dir="${experiment_folder}/${sample}"
     /bin/rm -rf -- "${sample_dir}/splits"
     prefix_aligned_files "${sample_dir}/aligned" "${sample}"
     cp -- "${sample_dir}/aligned/${sample}_inter.hic" "${analysis_folder}/hic_files/${sample}.hic"
   done 3< "${sample_all_list}"
 fi
 Rscript "${hic_essential_files}/hic_info_generation.R"
 # Threads handed to samtools; matches the ppn=24 requested in the PBS header.
 bam_threads=24

 #######################################
 # Compress one file with gtz and delete the original only if gtz succeeded.
 # Arguments: $1 - file to compress
 # Returns:   non-zero when the file is missing or gtz fails (file is kept)
 #######################################
 compress_and_remove() {
   local file=$1
   if [[ ! -f "${file}" ]]; then
     echo "WARNING: ${file} not found; nothing to compress" >&2
     return 1
   fi
   if gtz "${file}"; then
     /bin/rm -f -- "${file}"
   else
     echo "WARNING: gtz failed for ${file}; original kept" >&2
     return 1
   fi
 }

 #######################################
 # Convert <name>.sam to <name>.bam and delete the SAM only on success.
 # Arguments: $1 - SAM file
 #            $2 - samtools thread count (default 24)
 # Returns:   non-zero when the SAM is missing or samtools fails (SAM is kept,
 #            the incomplete BAM is removed)
 #######################################
 sam_to_bam() {
   local sam=$1 threads=${2:-24}
   local bam="${sam%.sam}.bam"
   if [[ ! -f "${sam}" ]]; then
     echo "WARNING: ${sam} not found; nothing to convert" >&2
     return 1
   fi
   # The thread option is "-@ N"; a bare "@ 24" is taken as two extra arguments.
   if samtools view -@ "${threads}" -Sb "${sam}" > "${bam}"; then
     /bin/rm -f -- "${sam}"
   else
     echo "WARNING: samtools failed for ${sam}; SAM kept" >&2
     /bin/rm -f -- "${bam}"
     return 1
   fi
 }

 # Shrink the per-sample intermediates. After the renaming step the files are
 # called "<sample>_<original name>"; there is no extra "${j}" component
 # (that variable is empty here and produced "<sample>__..." paths).
 if [[ ! -r "${sample_all_list:-}" ]]; then
   echo "ERROR: sample list '${sample_all_list:-}' is missing or unreadable" >&2
 else
   # fd 3 keeps the list away from the stdin of gtz/samtools.
   while read -r -u 3 sample || [[ -n "${sample}" ]]; do
     [[ -n "${sample}" ]] || continue
     aligned_prefix="${experiment_folder}/${sample}/aligned/${sample}"
     for suffix in merged_sort merged_nodups opt_dups dups collisions_nodups collisions; do
       compress_and_remove "${aligned_prefix}_${suffix}.txt"
     done
     for suffix in abnormal unmapped; do
       sam_to_bam "${aligned_prefix}_${suffix}.sam" "${bam_threads}"
     done
   done 3< "${sample_all_list}"
 fi
# #=========================hic to cool & cool to h5======================================= 
# source activate 3dgenome
# cat $sample_all_list | while read i
# do
# {
# 	hicConvertFormat -m ${hic_folder}/${i}.hic --inputFormat hic --outputFormat cool --correction_name KR -o ${cool_folder}/${i}_KR.cool -r 2500000 1000000 500000 250000 100000 50000 25000 10000 5000 1000 
# 	hicConvertFormat -m ${hic_folder}/${i}.hic --inputFormat hic --outputFormat cool --correction_name KR -o ${cool_folder}/${i}_KR.cool
# }&
# done 
# cat $sample_all_list | while read i
# do
# 	cat $resolution_list_all | while read j
# 	do
# 	{
# 		mv ${cool_folder}/${i}_KR_${j}000.cool ${cool_folder}/${i}_KR_${j}k.cool
# 	}&
# 	done
# done
# echo "All hic files have been converted to cool files."
# cat $sample_all_list | while read i
# do
# 	cat $resolution_list_all | while read j
# 	do
# 	{
# 		echo "Converting ${i}_KR_${j}k.cool -> ${i}_KR_${j}k.h5"
# 		hicConvertFormat -m ${cool_folder}/${i}_KR_${j}k.cool  --inputFormat cool --outputFormat h5 -o  ${h5_folder}/${i}_KR_${j}k.h5
# 		echo "Done"
# 	}&
# 	done
# done
# echo "All cool files have been converted to h5 files."
# #=========================hic matrix correlation========================================= 
# source activate 3dgenome
# cat $resolution_list_all | while read i
# do
# 	hicCorrelate --threads 24 -m ${cool_folder}/*_KR_${i}k.cool --plotFileFormat pdf --plotNumbers --chromosomes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y --threads 24 -oh ${correlation_folder}/${i}k_correlation_heatmap.pdf -os ${correlation_folder}/${i}k_correlation_scatter.pdf
# done 
# #========================hic matrix correlation data extraction and replot=================
# #First confirm that the pdftools and tidyverse packages are installed in R:
# #BiocManager::install(c("pdftools","tidyverse"))
# #This does not work on the SUSTech cluster because pdftools depends on poppler, which could not be installed there.
# #Rscript ${hic_essential_files}/correlation_heatmap_value_extraction_and_replot.R
# #rm -rf Rscript ${hic_essential_files}/correlation_heatmap_value_extraction_and_replot.R 
# #
# #=========================HicExplorer TAD================================================ 
# source activate 3dgenome
# cat $sample_without_rep_list | while read i
# do
# 	cat $resolution_list_tad | while read j
# 	do
# 		hicFindTADs -m ${cool_folder}/${i}_KR_${j}k.cool --outPrefix ${hicexplorer_tad_data_folder}/${i}_KR_${j}k --correctForMultipleTesting fdr  --chromosomes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y --numberOfProcessors 24
# 	done
# done 
# #=========================Eigenvector==================================================== 
# cat $resolution_list_eigenvector | while read k
# do
# 	mkdir ${eigenvector_folder}/${k} ${eigenvector_corrected_folder}/${k}k
# 	cat $chr_list | while read j
# 	do 
# 		cat $sample_without_rep_list | while read i
# 		do
# 			$juicer_tools eigenvector -p KR ${hic_folder}/${i}.hic $j BP ${k}000  | awk '(NR>2){print $0}' >  ${eigenvector_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k
# 			sed -i 's/NaN/0/g' ${eigenvector_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k
# 			cp ${eigenvector_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k ${eigenvector_corrected_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k
# 			paste ${genedensity_folder}/${k}k/genedensity_${k}k_chr${j}.bed ${eigenvector_corrected_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k > ${eigenvector_corrected_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k_with_genesdensity
# 			cat ${eigenvector_corrected_folder}/${k}k/${i}_chr${j}_eigenvector_KR_${k}k_with_genesdensity | awk -vI=$i -vJ=$j -vK=$k 'BEGIN{sumPos=0;sumNeg=0;pos=0;neg=0}{if($6>0&&$6!="NaN"){pos++;sumPos+=$5}else if($6<0&&$6!="NaN"){neg++;sumNeg+=$5}}END{print I"_chr"J"_eigenvector_KR_"K"k","pos",sumPos,pos,sumPos/pos,"neg",sumNeg,neg,sumNeg/neg,"pos-neg",sumPos/pos-sumNeg/neg}' >> ${eigenvector_corrected_folder}/${k}k/average_gene_density_${k}k.txt
# 		done
# 	done
# done
# Rscript ${hic_essential_files}/eigenvector_correction.R
# cat $resolution_list_eigenvector | while read k
# do
# 	rm -rf ${eigenvector_corrected_folder}/${k}k/*k_with_genesdensity
# done
# Rscript ${hic_essential_files}/compartment_plot.R
# #========================arrowhead TAD=================================================== 
# cat $sample_without_rep_list | while read i
# do
# 	cat $resolution_list_tad | while read j
# 	do
# 		$juicer_tools arrowhead --threads 24 -r ${j}000 -k KR --ignore-sparsity ${hic_folder}/${i}.hic ${arrowhead_tad_folder}/${i}_${j}k_domains
# 		mv ${arrowhead_tad_folder}/${i}_${j}k_domains/${j}000_blocks.bedpe TAD_arrowhead/${i}_${j}k_domains.bedpe
# 		rm -rf TAD_arrowhead/${i}_${j}k_domains
# 	done
# done
# #========================hiccups loop==================================================== 
# cat $sample_without_rep_list | while read i
# do
# 	$juicer_tools hiccups --cpu ${hic_folder}/${i}.hic  -k KR --ignore_sparsity ${hiccups_loop_folder}/${i}_hiccup_loop
# done 
# #========================3dgenome loop================================================ 
# source activate 3dgenome
# cat $sample_without_rep_list | while read i
# do
# 	cat $resolution_list_all | while read j
# 	do
# 		hicDetectLoops --threads 24 --statisticalTest wilcoxon-rank-sum -m ${cool_folder}/${i}_KR_${j}k.cool -o ${3dgenome_loop_folder}/${i}_${j}k_hicexporer_loop
# 	done	
# done 
# #========================DistVsCounts==================================================== 
# source activate 3dgenome
# cat $resolution_list_all | while read i
# do
# 	hicPlotDistVsCounts -m ${cool_folder}/*_${i}k.cool -o ${DistVsCounts_folder}/dist_vs_counts_${i}000.pdf --maxdepth 20000000  --outFileData
# done
# # labels could be modified with the following option:  --labels "Casodex_DHT" "Casodex_EtOH" "DMSO_DHT" "DMSO_EtOH" 
# #========================PlotSVL========================================================= 
# source activate 3dgenome
# cat resolution_list_all.txt | while read i
# do 
# 	hicPlotSVL  -m ${cool_folder}/*_${i}k.cool --distance 20000 --threads 24 --plotFileName ${SVL_folder}/SVL_${i}k_plot.png --outFileName ${SVL_folder}/SVL_${i}k_pvalues.txt --outFileNameData ${SVL_folder}/SVL_${i}k_data.txt --chromosomes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y
# done 
# 
# #========================Whole_Chromosome_Positioning_Analysis==============================
# # Since the interactions between chromosomes are the same across different resolutions,
# # data at a single resolution is enough and saves time.
# # The code below extracts the interactions at 2500k resolution.
# # A matrix at 2.5Mb resolution is enough for WCRA; the remaining files are for other purposes.
# cat $sample_without_rep_list | while read i
# do
# 	cat $chr_list | while read k
# 	do
# 		cat $chr_list | while read l
# 		do
# 			cat $resolution_list_all | while read j
# 			do
# 			{
# 				if [ $k = $l ]
# 				then
# 					${straw} NONE ${hic_folder}/${i}.hic $k $k BP ${j}000 > ${intra_matrix_NONE_folder}/${i}_chr${k}_NONE_${j}k_matrix.txt
# 				else
# 					${straw} NONE ${hic_folder}/${i}.hic $k $l BP ${j}000 > ${inter_matrix_NONE_folder}/${i}_chr${k}_chr${l}_NONE_${j}k_matrix.txt
# 				fi
# 			}&
# 		done
# 	done
# done
# Rscript ${hic_essential_files}/Whole_Chromosome_Positioning_Analysis.R
# rm -rf ${hic_essential_files}/Whole_Chromosome_Positioning_Analysis.R
# 
# #========================insulation_score_plot=============================================
# 
# #========================TAD overlap=======================================================
# 
# cat $resolution_list_tad | while read i
# do
# 	cat $overlap_ratio_list | while read l
# 	do
# 		cat $sample_without_rep_list | while read j
# 		do
# 			cat $sample_without_rep_list | while read k
# 			do
# 				if [ $j != $k ]
#  				then
#  					$bedtools intersect -wa -wb -a ${hicexplorer_tad_data_folder}/${j}_KR_${i}k_domains.bed -b ${hicexplorer_tad_data_folder}/${k}_KR_${i}k_domains.bed -f $l -r  > ${hicexplorer_tad_overlap_folder}/${j}_overlap_with_${k}_${i}k_overlap_ratio_${l}.domains.bed
#  				fi
#  			done
#  		done
#  	done
# done
# 
# 
# Rscript ${hic_essential_files}/TAD_overlap.R
# rm -rf ${hic_essential_files}/TAD_overlap.R
#============================================================================================