Unverified Commit 119e5083 authored by hy298's avatar hy298 Committed by GitHub
Browse files

Delete ChIP-seq directory

parent 0c6c95b3
Loading
Loading
Loading
Loading

ChIP-seq/README.md

deleted100644 → 0
+0 −30
Original line number Diff line number Diff line
# histone modifications - data mapping and peak calling

## reads mapping
get_data.sh
run_getData.sh

## process reads
process_aligned_reads.sh
run_process_aligned_reads.sh

## peakcalling for H3K4me1 and H3K4me3 (using HOMER)
findpeaks.sh
run_findpeaks.sh
filter_peaks.sh
filter_overlapped_intervals.py
filterpeaks_based_on_index.py

## define H3K4me1-marked enhancers
enhancer_calling.R

## peakcalling for H3K27ac (using MACS2)
peakCalling_H3K27ac.sh
run_peakCalling_H3K27ac.sh
run_MACS.NoModel.pl





ChIP-seq/enhancer_calling.R

deleted100644 → 0
+0 −75
Original line number Diff line number Diff line
library(GenomicRanges)
library(GenomicAlignments)

#######################################################################
# H3K4me1 peakset
#######################################################################
peakFile=read.delim("./H3K4me1.bed", header = FALSE, sep ="\t")
peaks=GRanges(seqnames=peakFile$V1, IRanges(peakFile$V2,peakFile$V3),tagcount=peakFile$V5,peakName=peakFile$V7)
peakSet=keepStandardChromosomes(trim(resize(peaks, width=1000+width(peaks), fix="center"))) #146421 peaks

#######################################################################
#  Map H3K4me1 read counts on H3K4me1 peaks
#######################################################################
H3K4me1_bamFilePath="/serenity/data/HAYUN/ChIP-seq/catalog/bamfiles/H3K4me1"
H3K4me1_bamFiles = dir(H3K4me1_bamFilePath, pattern="*.bam$", full.name=T)
H3K4me1_aln <- lapply(H3K4me1_bamFiles, readGAlignments)
H3K4me1_counts <- lapply(H3K4me1_aln, GenomicRanges::countOverlaps, query=peakSet)
names(H3K4me1_counts) <- gsub(".nodup.bam","",basename(H3K4me1_bamFiles))
H3K4me1_df=data.frame(as.list(H3K4me1_counts),stringsAsFactors=FALSE)
H3K4me1_rowsum=rowSums (H3K4me1_df, na.rm = FALSE)
H3K4me1_maxlevel=apply(H3K4me1_df, 1, max)
H3K4me1_rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),H3K4me1_df,RowSum=H3K4me1_rowsum, Max=H3K4me1_maxlevel)
write.table(H3K4me1_rawcount,"H3K4me1.rawcount.csv",row.names=FALSE,quote=FALSE,sep="\t")

### Counts per million 
H3K4me1_fac=1000000/ lengths(H3K4me1_aln, use.names = TRUE)
H3K4me1_cpm=round(t(t(H3K4me1_df)*H3K4me1_fac),6)
H3K4me1_cpm_rowsum=rowSums (H3K4me1_cpm, na.rm = FALSE)
H3K4me1_maxlevel.cpm=apply(H3K4me1_cpm, 1, max)
H3K4me1_cpm_count=data.frame(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),H3K4me1_cpm,RowSum=H3K4me1_cpm_rowsum,Max=H3K4me1_maxlevel.cpm)
write.table(H3K4me1_cpm_count,"H3K4me1.cpm.csv",row.names=FALSE,quote=FALSE,sep="\t")


#######################################################################
# H3K4me3 peakset
#######################################################################
peakFile=read.delim("./H3K4me3.bed", header = FALSE, sep ="\t")
peaks=GRanges(seqnames=peakFile$V1, IRanges(peakFile$V2,peakFile$V3),tagcount=peakFile$V5,peakName=peakFile$V7)
peakSet=keepStandardChromosomes(trim(resize(peaks, width=1000+width(peaks), fix="center"))) #39819 peaks
#mergedPeakSet=reduce(peakSet) #37643 peaks

#######################################################################
#  Map H3K4me3 read counts on H3K4me3 peaks
#######################################################################
H3K4me3_bamFilePath="/serenity/data/HAYUN/ChIP-seq/catalog/bamfiles/H3K4me3"
H3K4me3_bamFiles = dir(H3K4me3_bamFilePath, pattern="*.bam$", full.name=T)
H3K4me3_aln <- lapply(H3K4me3_bamFiles, readGAlignments)
H3K4me3_counts <- lapply(H3K4me3_aln, GenomicRanges::countOverlaps, query=peakSet)
names(H3K4me3_counts) <- gsub(".nodup.bam","",basename(H3K4me3_bamFiles))
H3K4me3_df=data.frame(as.list(H3K4me3_counts),stringsAsFactors=FALSE)
H3K4me3_rowsum=rowSums (H3K4me3_df, na.rm = FALSE)
H3K4me3_maxlevel=apply(H3K4me3_df, 1, max)
H3K4me3_rawcount=cbind(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),H3K4me3_df,RowSum=H3K4me3_rowsum, Max=H3K4me3_maxlevel)
write.table(H3K4me3_rawcount,"H3K4me3.rawcount.csv",row.names=FALSE,quote=FALSE,sep="\t")

### Counts per million
H3K4me3_fac=1000000/ lengths(H3K4me3_aln, use.names = TRUE)
H3K4me3_cpm=round(t(t(H3K4me3_df)*H3K4me3_fac),6)
H3K4me3_cpm_rowsum=rowSums (H3K4me3_cpm, na.rm = FALSE)
H3K4me3_maxlevel.cpm=apply(H3K4me3_cpm, 1, max)
H3K4me3_cpm_count=data.frame(chr=seqnames(peakSet),start=start(ranges(peakSet)),end=end(ranges(peakSet)),H3K4me3_cpm,RowSum=H3K4me3_cpm_rowsum,Max=H3K4me3_maxlevel.cpm)
write.table(H3K4me3_cpm_count,"H3K4me3.cpm.csv",row.names=FALSE,quote=FALSE,sep="\t")


############################################################################
# Defining enhancer based on the overlap between H3K4me3 and H3K4m1 peaks
############################################################################
H3K4me3peakFile=read.delim("../H3K4me3/based_on_HOMER/H3K4me3.cpm.csv", header = TRUE, sep ="\t")
H3K4me3peaks=GRanges(seqnames=H3K4me3peakFile$chr, IRanges(H3K4me3peakFile$start,H3K4me3peakFile$end),maxlevel=H3K4me3peakFile$Max)
H3K4me3peaks_gt16=H3K4me3peaks[H3K4me3peaks$maxlevel>16,]
enhancers=peakSet[!peakSet%over%H3K4me3peaks_gt16] #123231 enhancerPeaks
df.enhancer=data.frame(enhancers,stringsAsFactors=FALSE)
write.table(df.enhancer,"enhancerPeaks.bed",row.names=FALSE,quote=FALSE,sep="\t")
df.enhancer_merged=data.frame(reduce(enhancers),stringsAsFactors=FALSE) #98365 enhancerPeaks.merged
write.table(df.enhancer_merged,"enhancerPeaks.merged.bed",row.names=FALSE,quote=FALSE,sep="\t")
 No newline at end of file
+0 −40
Original line number Diff line number Diff line
#!/usr/bin/env python

##############################################################################
# Compare the distance between the centers of consecutive genomic intervals
# It takes the sorted file after concatenation of peaks from two replicates
# and find the distance between the two centers. If the distance between 
# centers is less than 500,it keep the one with higher signal 
# else keep both the intervals
##############################################################################
import sys

with open(sys.argv[1]) as infile:
	line=next(infile)
	line1=line.strip("\n").split("\t")
	for lines in infile:
		line2=lines.strip("\n").split("\t")
		#print line1
		#print line2
		if line1[0]==line2[0] and (int(line2[1])+500)-(int(line1[1])+500) <= 500:
			if float(line1[4]) >= float(line2[4]):
				pass
			else:   
				line1=line2
		else:
			print "\t".join(line1)
			line1=line2
	print "\t".join(line1)
"""
with open(sys.argv[1]) as infile:
	for lines in infile:
		line2=next(infile)
		line1=lines.strip("\n").split("\t")
		line2=line2.strip("\n").split("\t")
		if float(line1[4]) >= float(line2[4]):
			print "\t".join(line1)
		else:   
			print "\t".join(line2)
	
		#raw_input("Press Enter to continue...")
"""

ChIP-seq/filter_peaks.sh

deleted100644 → 0
+0 −157
Original line number Diff line number Diff line

### DM ###

mergePeaks -d 500 DM.H3K4me1.R1.peaks DM.H3K4me1.R2.peaks -prefix mergePeaks
grep -v \# mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks | awk '{print $9}'|sort >  DM.H3K4me1.R1.indices
./filterpeaks_based_on_index.py DM.H3K4me1.R1.indices DM.H3K4me1.R1.peaks  > DM.H3K4me1.R1.peaks.overlap
grep -v \# mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks | awk '{print $10}'|sort >  DM.H3K4me1.R2.indices
./filterpeaks_based_on_index.py DM.H3K4me1.R2.indices DM.H3K4me1.R2.peaks  > DM.H3K4me1.R2.peaks.overlap
grep -v \# DM.H3K4me1.R1.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"DM.H3K4me1.R1.peaks"}' > DM.H3K4me1.R1.bed 
grep -v \# DM.H3K4me1.R2.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"DM.H3K4me1.R2.peaks"}' > DM.H3K4me1.R2.bed
cat DM.H3K4me1.R1.bed DM.H3K4me1.R2.bed | sort -k1,1 -k2,2n >  DM.H3K4me1.R1_DM.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py DM.H3K4me1.R1_DM.H3K4me1.R2.sorted.bed > DM.H3K4me1.bed

### LN ###
mergePeaks -d 500 LN.H3K4me1.R1.peaks LN.H3K4me1.R2.peaks -prefix mergePeaks
grep -v \# mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks | awk '{print $9}'|sort >  LN.H3K4me1.R1.indices
./filterpeaks_based_on_index.py LN.H3K4me1.R1.indices LN.H3K4me1.R1.peaks  > LN.H3K4me1.R1.peaks.overlap
grep -v \# mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks | awk '{print $10}'|sort >  LN.H3K4me1.R2.indices
./filterpeaks_based_on_index.py LN.H3K4me1.R2.indices LN.H3K4me1.R2.peaks  > LN.H3K4me1.R2.peaks.overlap
grep -v \# LN.H3K4me1.R1.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"LN.H3K4me1.R1.peaks"}' > LN.H3K4me1.R1.bed 
grep -v \# LN.H3K4me1.R2.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"LN.H3K4me1.R2.peaks"}' > LN.H3K4me1.R2.bed
cat LN.H3K4me1.R1.bed LN.H3K4me1.R2.bed | sort -k1,1 -k2,2n >  LN.H3K4me1.R1_LN.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py LN.H3K4me1.R1_LN.H3K4me1.R2.sorted.bed > LN.H3K4me1.bed

### NPM1 ###
mergePeaks -d 500 NPM1.H3K4me1.R1.peaks NPM1.H3K4me1.R2.peaks -prefix mergePeaks
grep -v \# mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks | awk '{print $9}'|sort >  NPM1.H3K4me1.R1.indices
./filterpeaks_based_on_index.py NPM1.H3K4me1.R1.indices NPM1.H3K4me1.R1.peaks  > NPM1.H3K4me1.R1.peaks.overlap
grep -v \# mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks | awk '{print $10}'|sort >  NPM1.H3K4me1.R2.indices
./filterpeaks_based_on_index.py NPM1.H3K4me1.R2.indices NPM1.H3K4me1.R2.peaks  > NPM1.H3K4me1.R2.peaks.overlap
grep -v \# NPM1.H3K4me1.R1.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"NPM1.H3K4me1.R1.peaks"}' > NPM1.H3K4me1.R1.bed 
grep -v \# NPM1.H3K4me1.R2.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"NPM1.H3K4me1.R2.peaks"}' > NPM1.H3K4me1.R2.bed
cat NPM1.H3K4me1.R1.bed NPM1.H3K4me1.R2.bed | sort -k1,1 -k2,2n >  NPM1.H3K4me1.R1_NPM1.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py NPM1.H3K4me1.R1_NPM1.H3K4me1.R2.sorted.bed > NPM1.H3K4me1.bed

### FLT3 ###
mergePeaks -d 500 FLT3.H3K4me1.R1.peaks FLT3.H3K4me1.R2.peaks -prefix mergePeaks
grep -v \# mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks | awk '{print $9}'|sort >  FLT3.H3K4me1.R1.indices
./filterpeaks_based_on_index.py FLT3.H3K4me1.R1.indices FLT3.H3K4me1.R1.peaks  > FLT3.H3K4me1.R1.peaks.overlap
grep -v \# mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks | awk '{print $10}'|sort >  FLT3.H3K4me1.R2.indices
./filterpeaks_based_on_index.py FLT3.H3K4me1.R2.indices FLT3.H3K4me1.R2.peaks  > FLT3.H3K4me1.R2.peaks.overlap
grep -v \# FLT3.H3K4me1.R1.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"FLT3.H3K4me1.R1.peaks"}' > FLT3.H3K4me1.R1.bed 
grep -v \# FLT3.H3K4me1.R2.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"FLT3.H3K4me1.R2.peaks"}' > FLT3.H3K4me1.R2.bed
cat FLT3.H3K4me1.R1.bed FLT3.H3K4me1.R2.bed | sort -k1,1 -k2,2n >  FLT3.H3K4me1.R1_FLT3.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py FLT3.H3K4me1.R1_FLT3.H3K4me1.R2.sorted.bed > FLT3.H3K4me1.bed

###concatenate all the bed files
cat DM.H3K4me1.bed  FLT3.H3K4me1.bed  LN.H3K4me1.bed  NPM1.H3K4me1.bed | sort -k1,1 -k2,2n >H3K4me1.allpeaks.bed

### get the catalog
./get_catalog.py H3K4me1.allpeaks.bed > H3K4me1.bed

### NE #
mergePeaks -d 500 NE.H3K4me1.R1.peaks NE.H3K4me1.R2.peaks -prefix mergePeaks
grep -v \# mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks | awk '{print $9}'|sort >  NE.H3K4me1.R1.indices
./filterpeaks_based_on_index.py NE.H3K4me1.R1.indices NE.H3K4me1.R1.peaks  > NE.H3K4me1.R1.peaks.overlap
grep -v \# mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks | awk '{print $10}'|sort >  NE.H3K4me1.R2.indices
./filterpeaks_based_on_index.py NE.H3K4me1.R2.indices NE.H3K4me1.R2.peaks  > NE.H3K4me1.R2.peaks.overlap
grep -v \# NE.H3K4me1.R1.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"NE.H3K4me1.R1.peaks"}' > NE.H3K4me1.R1.bed 
grep -v \# NE.H3K4me1.R2.peaks.overlap |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8,"NE.H3K4me1.R2.peaks"}' > NE.H3K4me1.R2.bed
cat NE.H3K4me1.R1.bed NE.H3K4me1.R2.bed | sort -k1,1 -k2,2n >  NE.H3K4me1.R1_NE.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py NE.H3K4me1.R1_NE.H3K4me1.R2.sorted.bed > NE.H3K4me1.bed

cat LN.H3K4me1.bed  NE.H3K4me1.bed | sort -k1,1 -k2,2n >H3K4me1.LN_NEpeaks.bed
./get_catalog.py H3K4me1.LN_NEpeaks.bed > H3K4me1.LN_NE.bed
#########################################
# OLD COMMANDS
#########################################
# Find overlapping peaks
mergePeaks -d 500 DM.H3K4me1.R1.peaks DM.H3K4me1.R2.peaks -prefix mergePeaks
mergePeaks -d 500 LN.H3K4me1.R1.peaks LN.H3K4me1.R2.peaks -prefix mergePeaks
mergePeaks -d 500 NPM1.H3K4me1.R1.peaks NPM1.H3K4me1.R2.peaks -prefix mergePeaks
mergePeaks -d 500 NE.H3K4me1.R1.peaks NE.H3K4me1.R2.peaks -prefix mergePeaks
mergePeaks -d 500 FLT3.H3K4me1.R1.peaks FLT3.H3K4me1.R2.peaks -prefix mergePeaks
# Extract the genomic ranges from the overlap file
awk '{OFS="\t"};{print $2,$3,$4}' DM.H3K4me1_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks >DM.H3K4me1.R1_DM.H3K4me1.R2_overlap.bed && sed -i '1,+0d' DM.H3K4me1.R1_DM.H3K4me1.R2_overlap.bed

# convert homer peak file into bed file
grep -v \# DM.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8}' > DM.H3K4me1.R1.bed 
grep -v \# DM.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,$1,$6,$8}' > DM.H3K4me1.R2.bed

# Filter the overlapped peaks from each replicates
bedtools intersect -a  DM.H3K4me1.R1.bed -b DM.H3K4me1.R1_DM.H3K4me1.R2_overlap.bed -wa > DM.H3K4me1.R1.overlapped.bed
bedtools intersect -a  DM.H3K4me1.R2.bed -b DM.H3K4me1.R1_DM.H3K4me1.R2_overlap.bed -wa > DM.H3K4me1.R2.overlapped.bed

# concatenate both the files with overlapped peak ranges and sort it
cat DM.H3K4me1.R1.overlapped.bed DM.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  DM.H3K4me1.R1.R2.overlap.bed

# Filter the oerlapped peaks to keep the one with stronger signal
./filter_overlapped_intervals.py DM.H3K4me1.R1.R2.overlap.bed > DM.H3K4me1.bed
#########
# DM
#########
awk '{OFS="\t"};{print $2,$3,$4}' mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks >mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks.bed && sed -i '1,+0d' mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks.bed
grep -v \# DM.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,"DM.H3K4me1.R1.peaks",$6,$8}' > DM.H3K4me1.R1.bed 
grep -v \# DM.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,"DM.H3K4me1.R2.peaks",$6,$8}' > DM.H3K4me1.R2.bed
bedtools intersect -a  DM.H3K4me1.R1.bed -b mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks.bed -wa > DM.H3K4me1.R1.overlapped.bed
bedtools intersect -a  DM.H3K4me1.R2.bed -b mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks.bed -wa > DM.H3K4me1.R2.overlapped.bed
cat DM.H3K4me1.R1.overlapped.bed DM.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  DM.H3K4me1.R1_DM.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py DM.H3K4me1.R1_DM.H3K4me1.R2.sorted.bed > DM.H3K4me1.bed

#########
# LN
#########
awk '{OFS="\t"};{print $2,$3,$4}' mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks >mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks.bed && sed -i '1,+0d' mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks.bed
grep -v \# LN.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,"LN.H3K4me1.R1.peaks",$6,$8}' > LN.H3K4me1.R1.bed 
grep -v \# LN.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,"LN.H3K4me1.R2.peaks",$6,$8}' > LN.H3K4me1.R2.bed
bedtools intersect -a  LN.H3K4me1.R1.bed -b mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks.bed -wa > LN.H3K4me1.R1.overlapped.bed
bedtools intersect -a  LN.H3K4me1.R2.bed -b mergePeaks_LN.H3K4me1.R1.peaks_LN.H3K4me1.R2.peaks.bed -wa > LN.H3K4me1.R2.overlapped.bed
cat LN.H3K4me1.R1.overlapped.bed LN.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  LN.H3K4me1.R1_LN.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py LN.H3K4me1.R1_LN.H3K4me1.R2.sorted.bed > LN.H3K4me1.bed

#########
# NPM1
#########
awk '{OFS="\t"};{print $2,$3,$4}' mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks >mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks.bed && sed -i '1,+0d' mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks.bed
grep -v \# NPM1.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,"NPM1.H3K4me1.R1.peaks",$6,$8}' > NPM1.H3K4me1.R1.bed 
grep -v \# NPM1.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,"NPM1.H3K4me1.R2.peaks",$6,$8}' > NPM1.H3K4me1.R2.bed
bedtools intersect -a  NPM1.H3K4me1.R1.bed -b mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks.bed -wa > NPM1.H3K4me1.R1.overlapped.bed
bedtools intersect -a  NPM1.H3K4me1.R2.bed -b mergePeaks_NPM1.H3K4me1.R1.peaks_NPM1.H3K4me1.R2.peaks.bed -wa > NPM1.H3K4me1.R2.overlapped.bed
cat NPM1.H3K4me1.R1.overlapped.bed NPM1.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  NPM1.H3K4me1.R1_NPM1.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py NPM1.H3K4me1.R1_NPM1.H3K4me1.R2.sorted.bed > NPM1.H3K4me1.bed

#########
# FLT3
#########
awk '{OFS="\t"};{print $2,$3,$4}' mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks >mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks.bed && sed -i '1,+0d' mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks.bed
grep -v \# FLT3.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,"FLT3.H3K4me1.R1.peaks",$6,$8}' > FLT3.H3K4me1.R1.bed 
grep -v \# FLT3.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,"FLT3.H3K4me1.R2.peaks",$6,$8}' > FLT3.H3K4me1.R2.bed
bedtools intersect -a  FLT3.H3K4me1.R1.bed -b mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks.bed -wa > FLT3.H3K4me1.R1.overlapped.bed
bedtools intersect -a  FLT3.H3K4me1.R2.bed -b mergePeaks_FLT3.H3K4me1.R1.peaks_FLT3.H3K4me1.R2.peaks.bed -wa > FLT3.H3K4me1.R2.overlapped.bed
cat FLT3.H3K4me1.R1.overlapped.bed FLT3.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  FLT3.H3K4me1.R1_FLT3.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py FLT3.H3K4me1.R1_FLT3.H3K4me1.R2.sorted.bed > FLT3.H3K4me1.bed

###concatenate all the bed files
cat DM.H3K4me1.bed  FLT3.H3K4me1.bed  LN.H3K4me1.bed  NPM1.H3K4me1.bed | sort -k1,1 -k2,2n >H3K4me1.allpeaks.bed

### get the catalog
./get_catalog.py H3K4me1.allpeaks.bed > H3K4me1.bed
#########
# NE
#########
awk '{OFS="\t"};{print $2,$3,$4}' mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks >mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks.bed && sed -i '1,+0d' mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks.bed
grep -v \# NE.H3K4me1.R1.peaks |awk '{OFS="\t"};{print $2,$3,$4,"NE.H3K4me1.R1.peaks",$6,$8}' > NE.H3K4me1.R1.bed 
grep -v \# NE.H3K4me1.R2.peaks |awk '{OFS="\t"};{print $2,$3,$4,"NE.H3K4me1.R2.peaks",$6,$8}' > NE.H3K4me1.R2.bed
bedtools intersect -a  NE.H3K4me1.R1.bed -b mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks.bed -wa > NE.H3K4me1.R1.overlapped.bed
bedtools intersect -a  NE.H3K4me1.R2.bed -b mergePeaks_NE.H3K4me1.R1.peaks_NE.H3K4me1.R2.peaks.bed -wa > NE.H3K4me1.R2.overlapped.bed
cat NE.H3K4me1.R1.overlapped.bed NE.H3K4me1.R2.overlapped.bed | sort -k1,1 -k2,2n >  NE.H3K4me1.R1_NE.H3K4me1.R2.sorted.bed
./filter_overlapped_intervals.py NE.H3K4me1.R1_NE.H3K4me1.R2.sorted.bed > NE.H3K4me1.bed

###concatenate all the bed files with NE
cat ../DM.H3K4me1.bed  ../FLT3.H3K4me1.bed  ../LN.H3K4me1.bed  ../NPM1.H3K4me1.bed  ../NE.H3K4me1.bed | sort -k1,1 -k2,2n >H3K4me1.allpeaks.bed

### get the catalog
../get_catalog.py H3K4me1.allpeaks.bed > H3K4me1.bed

+0 −19
Original line number Diff line number Diff line
#!/usr/bin/env python

############################################################################################################
# filter peaks that overlap between replicates from each parent file based on indices from mergePeak 
# filterpeaks_based_on_index.py DM.H3K4me1.R1.indices DM.H3K4me1.R1.peaks  > DM.H3K4me1.R1.peaks.overlap
# Indices are retrieve using following command
# grep -v \# mergePeaks_DM.H3K4me1.R1.peaks_DM.H3K4me1.R2.peaks | awk '{print $9}'|sort >  DM.H3K4me1.R1.indices
# DM.H3K4me1.R1.peaks -> from mergePeaks
############################################################################################################

import sys
index_list=[line.strip("\n") for line in open(sys.argv[1])]

with open(sys.argv[2]) as infile:
	for lines in infile:
		if not lines.startswith("#"):
			line=lines.strip("\n").split("\t")
			if line[0] in index_list:
				print lines.strip("\n")
Loading