Unverified Commit a716005d authored by clintmil's avatar clintmil Committed by GitHub
Browse files

Merge branch 'Tarela:main' into main

parents a78a9653 ee08024f
Loading
Loading
Loading
Loading
+136 −0
Original line number Diff line number Diff line
#!/usr/bin/env python2.7
import regions as R
from multiprocessing import Pool
import subprocess
import sys,os
import numpy as np
import math
import argparse
import time

def extractBigwig(cmd,bins):
    result = os.popen(cmd)
    content = result.readlines()

    if content:
        temp = content[0].strip().replace('n/a','0')
        temp = temp.split('\t')
        return temp
    else:
        temp=[0]*int(bins)
        return temp

def getInfo(genome, name, alpha, bwfilename, tss, chromesize, bwsum):
    
    Infos = []
    chromeinfo = {}
    inf = open(chromesize)
    for line in inf:
        line = line.strip().split('\t')
        chrome = line[0]
        size = line[1]
        chromeinfo[chrome] = size

    G = R.interval(genome=genome)
    G.read_bed(tss)
    
    padding = int(1e5)
  
    weight  = np.array( [ 2.0*math.exp(-alpha*math.fabs(z)/1e5)/(1.0+math.exp(-alpha*math.fabs(z)/1e5))  for z in range( -padding,padding+1) ] )
    #print weight
    #print np.sum(weight)    
    #chrom_p = None
    for i,chrom in enumerate(G.chrom):
        center = G.start[i]
        chrom = G.chrom[i]
        bwstart = center - padding
        bwend = center + padding
        if bwstart < 0:
            bwstart = 0
        if bwend > int(chromeinfo[G.chrom[i]]):
            bwend = int(chromeinfo[G.chrom[i]])
        bins = bwend - bwstart + 1
        #print np.shape(weight)
        Infos.append([bwfilename,chrom,bwstart,bwend,bins,weight,name,G.start[i],G.end[i], G.name[i].split(':')[0], G.name[i].split(':')[1],G.strand[i], bwsum])
    #print Infos
    return Infos

def getRP(Info):
    bwfilename = Info[0]
    chrom = Info[1]
    bwstart = Info[2]
    bwend = Info[3]
    bins = Info[4]
    weight = Info[5]
    name = Info[6]
    Gstart = Info[7]
    Gend = Info[8]
    Gname1 = Info[9]
    Gname2 = Info[10]
    Gstrand = Info[11]

    bwsum = Info[12]

    padding = int(1e5)
    center = Gstart
    cmd = '%s {0} {1} {2} {3} {4}'%bwsum
    try:
        cmd = cmd.format(bwfilename,chrom,bwstart,bwend + 1, bins)
        #print cmd
        values = extractBigwig(cmd,bins)
        #print len(values)
        values = np.array(values,dtype=np.float64)
   
        values2 = np.hstack((np.zeros(bwstart - center + padding),values,np.zeros(center + padding - bwend )))
        invalid = np.isnan( values2 )
        values2[ invalid ]   = 0
        #print np.sum(values2)

        return chrom + '\t' + str(Gstart) + '\t' + str(Gend) + '\t' + Gname1 + '\t' + str(np.dot( values2, weight ))  + '\t' + Gname2 + '\t' + Gstrand + '\n'
    except:
        return chrom + '\t' + str(Gstart) + '\t' + str(Gend) + '\t' + Gname1 + '\t' + str(0)  + '\t' + Gname2 + '\t' + Gstrand + '\n'


if __name__ == "__main__":
    start = time.time()
    results = []
    try:
        parser = argparse.ArgumentParser(description="""Map TSS to the nearest regions.""")
        parser.add_argument( '-g', dest='genome', default='hg38', choices=['mm9','hg19','mm10','hg38'], required=False, help='genome' )
        parser.add_argument( '-n', dest='name',   type=str, required=True, help='name to associate with input bed file' )
        parser.add_argument( '-a', dest='alpha',  type=float, default=1e4, required=False, help='effect of distance on regulatory potential. Distance at which regulatory potential is 1/2, (default=10kb)' )
        parser.add_argument( '-b', dest='bw',     type=str, required=True, help='Bigwig file name' )
        parser.add_argument( '--cs', dest='chromesize',  type=str, required=False, default="/nv/vol190/zanglab/sh8tv/Data/Genome/hg38/chromInfo_hg38.txt", help='chrom.sizes is two columns: <chromosome name> <size in bases>' )
#        parser.add_argument( '--tss', dest='refseqTSS',  type=str, required=False, default="/scratch/sh8tv/Project/regulation_network/Data/annotation/hg38_uniq_symbol_withExonArrayExp_TSS.bed", help='refseqTSS is six columns: <chromosome name> <TSS> <TSS+1> <refseq:genesymbok> <score> <strand>' )
#        parser.add_argument( '--tss', dest='refseqTSS',  type=str, required=False, default="/nv/vol190/zanglab/sh8tv/Data/refgenes/hg38/hg38_refseq_clean_tss_forRP.bed", help='refseqTSS is six columns: <chromosome name> <TSS> <TSS+1> <refseq:genesymbok> <score> <strand>' )
        parser.add_argument( '--tss', dest='refseqTSS',  type=str, required=False, default="/nv/vol190/zanglab/sh8tv/Data/refgenes/geneID_annotation/hg38_gene_annotation_geneID_LenOrder_TSS_forRP.bed", help='refseqTSS is six columns: <chromosome name> <TSS> <TSS+1> <refseq:genesymbok> <score> <strand>' )
        parser.add_argument( '--thread', dest='threads',  type=int, required=False, default=8, help='Number of threads to calcuate the Regulatory Potential, DEFAULT=8' )
        parser.add_argument( '--bwsum', dest='bwsum',  type=str, required=False, help='Path fot the bigWigSummary script', default='/nv/vol190/zanglab/sh8tv/bin/UCSCTools/bigWigSummary' )
        args = parser.parse_args()
        #chromeinfo = chromesizeinfo(args.chromesize)
        output = args.name
        outf = open(output,'w+')
        alpha = -math.log(1.0/3.0)*1e5/args.alpha
        Infos = getInfo(args.genome, args.name, alpha, args.bw, args.refseqTSS, args.chromesize, args.bwsum)
        #print Infos[0]
        p = Pool(args.threads)
        result = p.map_async(getRP, Infos, callback=results.append)
        p.close()
        p.join()
       # print results
        for line in results:
            for element in line:
                outf.write(element)

    except KeyboardInterrupt:
        sys.stderr.write("User interrunpt me! ;-) Bye!\n")
        sys.exit(0)
    outf.close()       
    end = time.time()
    total = end - start
    hour = int(total/3600)
    minite = int(total - hour*3600)/60
    second = int(total - hour*3600 - minite*60)
    print 'total time: %s:%s:%s'%(hour, minite, second)

+0 −0

Empty file added.

+40 −0
Original line number Diff line number Diff line
#### super enhancer data analysis pipeline

# requirements: bowtie2 (v2.3.5.1), samtools (1.12), bedtools (2.29.2), sicer, macs2 (v2.1.2), uscstools (v4)
# hg38.chrom.sizes can be downloaded from UCSC table browser
# hg38_bowtie2_index can be generated by bowtie2 build function
 
# H3K27ac data pre-process
bowtie2 -p 8 -X 2000 -x hg38_bowtie2_index -1 H3K27ac_p1.fastq -2 H3K27ac_p2.fastq -S H3K27ac.sam 2>&1 >>/dev/null | tee H3K27ac_bowtie2PE.out
samtools view -bS H3K27ac.sam > H3K27ac_raw.bam
samtools view -H H3K27ac_raw.bam > H3K27ac_filtered.sam
samtools view -f 0x2 H3K27ac_raw.bam | awk 'NR % 2 == 1{mapq=$5;forward=$0} NR % 2 == 0{if($5>=30 && mapq>=30 && substr($3,1,3)=="chr" && $7=="=" ) print forward"\n"$0}' >> H3K27ac_filtered.sam
samtools view -bS H3K27ac_filtered.sam > H3K27ac.bam
bamToBed -i H3K27ac.bam -bedpe | awk '{OFS="\t"; print $1,$2,$6,$7,$8,"."}' | sort -k 1,1 -k 2,2g -k 3,3g|cut -f 1-3 |uniq > H3K27ac_PEuniq.bed

# super enhancer detection
sicer -t H3K27ac.bam -c HCASMC_IgG.bam -s hg38 -w 200 -rt 1 -f 150 -egf 0.74 -fdr 0.01 -g 600 -e 1000
mv H3K27ac-W200-G600-FDR0.01-island.bed H3K27ac_sicerPeak.bed
awk '{OFS="\t";if($3-$2>=10000) print $0}' H3K27ac_sicerPeak.bed > H3K27ac_sicer10kb.bed

# fetch reads on super enhancer region
intersectBed -a H3K27ac_PEuniq.bed  -b ../../H3K27ac_nonPromoter.bed -u > H3K27ac_readsOnLongNPpeaks.bed
macs2 pileup -i H3K27ac_readsOnLongNPpeaks.bed --extsize 1 -f BEDPE -o H3K27ac_readsOnLongNPpeaks_cuts.bdg
bedClip H3K27ac_readsOnLongNPpeaks_cuts.bdg hg38.chrom.sizes H3K27ac_readsOnLongNPpeaks_cuts.bdg.clip
sort -k1,1 -k2,2n H3K27ac_readsOnLongNPpeaks_cuts.bdg.clip > H3K27ac_readsOnLongNPpeaks_cuts.bdg.clip.sort
bedGraphToBigWig H3K27ac_readsOnLongNPpeaks_cuts.bdg.clip.sort hg38.chrom.sizes H3K27ac_readsOnLongNPpeaks_cuts.bw

bdg2bw H3K27ac_readsOnLongNPpeaks_cuts.bdg /nv/vol190/zanglab/sh8tv/Data/Genome/hg38/hg38_clean.len

# calculate regulatory potential of each gene from reads on super enhancer region
python RPCalc.py -n H3K27ac_readsOnLongNPpeaks_RP.bed -b H3K27ac_readsOnLongNPpeaks_cuts.bw


#### bulk ATAC-seq data analysis pipeline (PEPATAC)
# requirements: pepatac (v0.10.3), singularity (v3.1.1)
# pepatac.py, hg38_TSS.tsv, rCRSd, human_repeats, and hg38.blacklist.bed can all be downloaded from PEPATAC website (https://github.com/databio/pepatac)

# bulk ATAC-seq data process
singularity exec --bind hg38 pepatac pepatac.py -O PEPATAC_output -Q paired -C pepatac.yaml -gs hs -S bulkATAC -I bulkATAC_p1.fq.gz -I2 bulkATAC_p2.fq.gz -G hg38 --TSS-name hg38_TSS.tsv --prealignments rCRSd human_repeats --blacklist hg38.blacklist.bed 

+63 −0
Original line number Diff line number Diff line
# PEPATAC configuration file for an ATACseq pipeline based on pypiper

# basic tools 
# public tools
tools:  # absolute paths to required tools
  java: java
  python: python
  samtools: samtools
  bedtools: bedtools
  bowtie2: bowtie2
  fastqc: fastqc
  macs2: macs2
  samblaster: samblaster
  skewer: skewer
  perl: perl
  # ucsc tools
  bedGraphToBigWig: bedGraphToBigWig
  wigToBigWig: wigToBigWig
  bigWigCat: bigWigCat
  bedToBigBed: bedToBigBed
  # optional tools
  fseq: fseq  
  trimmo: ${TRIMMOMATIC}
  picard: ${PICARD}
  Rscript: Rscript 

# user configure 
resources:
  genomes: ${GENOMES}
  adapters: null  # Set to null to use default adapter file included in repository

parameters:  # parameters passed to bioinformatic tools, subclassed by tool
# Adjust/Add/Remove parameters for individual tools here
  bowtie2_pre: # Modify bowtie2 prealignment settings
    params: ''
    # pipeline default: -k 1 -D 20 -R 3 -N 1 -L 20 -i S,1,0.50
    # -k 1: report up to <1> alns per read; MAPQ not meaningful
    # -D 20: give up extending after <20> failed extends in a row
    # -R 3: for reads w/ repetitive seeds, try <3> sets of seeds
    # -N 1: max # mismatches in seed alignment; can be 0 or 1
    # -L 20: length of seed substrings; must be >3, <32
    # -i S,1,0.50: interval between seed substrings w/r/t read len
  bowtie2: # Modify bowtie2 primary genome alignment settings
    params: ''
    # pipeline default: --very-sensitive -X 2000
    # --very-sensitive: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
    # -X 2000: paired-end maximum fragment length
  samtools:
    params: '-q 10'
    # -q: skip alignments with MAPQ < 10.
  macs2: 
    params: '-f BED -q 0.01 --shift -100 --extsize 200 --nomodel --keep-dup all'
    # -f: Format of tag file.
    # -q: The qvalue (minimum FDR) cutoff to call significant regions.
    # --shift:  Assign an arbitrary shift in bp. See MACS documentation.
    # --nomodel: Will bybass building the shifting model.
  fseq:
    params: '-of npf -l 600 -t 4.0 -s 1'
    # -of: narrowPeak as output format.
    # -l: feature length.
    # -t: "threshold" (standard deviations).
    # -s: wiggle track step.
    
+189 −0
Original line number Diff line number Diff line
import sys,time,os,math,struct,random,numpy
import twobitreader

import __init__
#import seqpos.seqpos as seqpos
#import re,MySQLdb,operator 
#import mystats

NONEARESTSITE = 999999999
MISSINGVAL    = -99999999
NOCLUSTER     = -1
ERRCODE  = 1
PASSCODE = 0

def maskseq( seq, masklocs ):
    """
    seq is a list of sequence strings
    masklocs is a list of 3 tuples / lists: 
        x[0] index of string to be masked
        x[1] start position to be masked
        x[2] end position + 1 
    """
    for xloc in masklocs:
        l = list( seq[xloc[0]] )
        l[xloc[1]:xloc[2]] = (xloc[2]-xloc[1])*['N']
        seq[xloc[0]] = ''.join(l)
    return


class interval(object):

    def __init__(self,genome,description=''):
        self.description = description
        self.genome = genome
        self.chrom  = None 
        self.start  = None
        self.end    = None
        self.idx    = None
        self.length = None
        self.prob   = None
        self.val    = None
        self.name   = None
        self.namelookup = None # MakeIndex makes a dictionary of names mapping to indices 
        self.mid    = None
        self.strand = None
        self.name2  = None
        self.seq    = None
        self.conservation = None
        self.motifscores  = None
        self.basecontent  = None
        self.info   = None # list of strings holding information about ChIP 

    def __str__(self):
        """
        print bed file
        """
        s=''
        for i,elem in enumerate(self.chrom):
            s += '\t'.join([self.chrom[i],str(self.start[i]),str(self.end[i])])
            if self.name and (len(self.name) == len(self.chrom)):
                s+= '\t%s' % self.name[i]
            if self.val != None: # and len(self.val) == len(self.chrom):
                s+= '\t%5.2f' % self.val[i]
            if self.strand != None: # and len() == len(self.chrom):
                s+= '\t%s' % self.strand[i]
           
            s += '\n'
        return s

    def __getitem__(self,key):
        """
        Slice interval object
        """
        new = interval(self.genome,description=self.description)

        for elem in self.__dict__:
            # copy lists 
            if type( self.__dict__[elem] ) == type([]):
                new.__dict__[elem] = self.__dict__[elem][key]
            # copy arrays
            if type( self.__dict__[elem] ) == type(numpy.zeros(1,float)):
                new.__dict__[elem] = self.__dict__[elem][key]
            # copy dictionaries
            #if type( self.__dict__[elem] ) == type({}):
            #    new.__dict__[elem] = self.__dict__[elem]

        return new


    def read_bed(self,file,scorefield=4,strandfield=5,name2field=6):
        """
        Scorefield is the column to be read as score (index starting at 0).
        """

        CHROMFIELD,STARTFIELD,ENDFIELD,NAMEFIELD = range(4)
        SCOREFIELD = scorefield
        STRANDFIELD = strandfield
        NAME2FIELD = name2field

        try:
            fp=open(file,'r')
        except IOError:
            print "%s not found." % file
            raise

        self.chrom, self.start, self.end = [],[],[]
        self.length, self.mid = [],[]
    
        fi = iter(fp)

        nameflag,scoreflag,strandflag,name2flag = False,False,False,False
        for i,line in enumerate(fi):
            if line[0:3] == 'chr':
 
                try:
                    field    = line.split()
                    chrom_t  = field[CHROMFIELD]
                    start_t  = int(field[STARTFIELD])
                    end_t    = int(field[ENDFIELD])
                    length_t = end_t-start_t
                    mid_t    = int(0.5*(end_t+start_t))
                    #strand_t = field[STRANDFIELD]
                    
                except:
                    chrom_t,start_t,end_t,length_t,mid_t = None,None,None,None,None
                    print "Error reading line of bedfile %s on line %d:\n%s" % (file,i,line)
                    continue
                    #return ERRCODE
   
                #print strand_t
                if chrom_t:
                    self.chrom  += [chrom_t]
                    self.start  += [start_t]
                    self.end    += [end_t]
                    self.length += [length_t]
                    self.mid    += [mid_t]
                    #self.strand    += [strand_t]
     
                    if len(self.chrom) == 1:
                        if len(field) > NAMEFIELD:
                            nameflag = True
                            self.name = []
                        if len(field) > SCOREFIELD:
                            scoreflag = True
                            self.val = []
                        if len(field) > STRANDFIELD and field[STRANDFIELD] in ['+','-']:
                            strandflag = True
                            self.strand = []
                        if len(field) > NAME2FIELD:
                            name2flag = True
                            self.name2 = []

                    if nameflag == True:
                        try:
                            self.name   += [field[NAMEFIELD]]
                        except IndexError:
                            print "Error reading name in bedfile %s on line %d:\n%s" % (file,i,line)
                            return ERRCODE
                    
                    if scoreflag == True:
                        try:
                            self.val    += [float(field[SCOREFIELD])]
                        except:
                            print "Error reading score in bedfile %s on line %d:\n%s" % (file,i,line)
                            return ERRCODE

                    if strandflag == True:
                        try:
                            self.strand += [field[STRANDFIELD]]
                            if self.strand[-1] not in ['-','+']:
                                print "Error reading strand in bedfile %s on line %d:\n%s" % (file,i,line)
                                print "Bed file error: strand must be + or - %s" % strand[-1]
                                return ERRCODE
                        except:
                            print "Error reading bedfile %s on line %d:\n%s" % (file,i,line)
                            return ERRCODE
                    if name2flag == True:
                        try:
                            self.name2 += [field[NAME2FIELD]]
                        except:
                            print "Error reading score in bedfile %s on line %d:\n%s" % (file,i,line)
                            return ERRCODE
                    
                        #self.strand += [field[STRANDFIELD]]

        #print "number of regions:", len(self.chrom)
        return PASSCODE

Loading