Commit febfcb6f authored by Chris Cheshire's avatar Chris Cheshire
Browse files

Updated peak reprod calculation script to better handle file diff

parent 4efc76a9
Loading
Loading
Loading
Loading
+33 −14
Original line number Diff line number Diff line
@@ -31,27 +31,44 @@ args = parser.parse_args()

# Init
peak_perc = 0
numfiles = 0
num_columns = 0

print('Reading file')

# Read file in using dask
ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','overlap_1','key','a_name','b_name','count'],
    dtype={'chrom':str,'start':np.int64,'end':np.int64,'overlap_1':np.float64,'key':np.float64,'a_name':str,'b_name':str,'count':np.int64})
# Read first line
first_line = None
with open(args.intersect, "r") as file:
    for line in file:
        first_line = line
        break

if first_line is not None:
    first_line_split = first_line.split('\t')
    num_columns = len(first_line_split)
    numfiles = 1

if numfiles != 0:
    print('Number of columns: ' + str(num_columns))

# Find number of files
numfiles = ddf_inter['b_name'].max().compute()
    ddf_inter = None
    if num_columns == 6:
        # Read file in using dask
        ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','key','file_num','count'],
            dtype={'chrom':str,'start':np.int64,'end':np.int64,'key':str, 'file_num':np.int32, 'count':np.int32})
        numfiles = ddf_inter['file_num'].max().compute()

# Check for table format
if isinstance(numfiles, str):
    print('Detected single file, reloading table')
    elif num_columns == 5:
        # Read file in using dask
        ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','key','count'],
            dtype={'chrom':str,'start':np.int64,'end':np.int64,'key':str, 'file_num':np.int32, 'count':np.int32})
        numfiles = 1
    ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','overlap_1','overlap_2','key','name','count'],
        dtype={'chrom':str,'start':np.int64,'end':np.int64,'overlap_1':np.float64,'overlap_2':np.float64,'key':str,'name':str,'count':np.int64})
    else:
        print('Invalid file format detected')
        exit(1)

    print('Number of files: ' + str(numfiles))

# Check for empty file
if numfiles != 0:
    # Find total number of peaks
    ddf_inter_grouped = ddf_inter.groupby(by=["key"]).size()
    df_inter_grouped = ddf_inter_grouped.compute()
@@ -73,6 +90,8 @@ if numfiles != 0:

        # Calc peak percentage
        peak_perc = (overlap_peaks / total_peaks) * 100
else:
    print('Empty file detected')

# Create string and write to file
output_string = str(peak_perc)
+6 −0
Original line number Diff line number Diff line
@@ -252,6 +252,12 @@ params {
            publish_files = false
        }

        "calc_peak_repro_cut" {
            args          = "-f 1,2,3,6"
            suffix        = ".repro"
            ext           = "bed"
        }

        "calc_peak_repro" {
            publish_files = false
        }
+41 −13
Original line number Diff line number Diff line
@@ -299,6 +299,7 @@ include { AWK as AWK_EDIT_PEAK_BED } from "../modules/local/linux/awk"
include { AWK as AWK_FRAG_BIN             } from "../modules/local/linux/awk"                          addParams( options: modules["awk_frag_bin"]        )
include { SAMTOOLS_CUSTOMVIEW             } from "../modules/local/samtools_custom_view"               addParams( options: modules["samtools_frag_len"]   )
include { CALCULATE_FRIP                  } from "../modules/local/modules/calculate_frip/main"        addParams( options: modules["calc_frip"]           )
include { CUT as CUT_CALC_REPROD          } from "../modules/local/linux/cut"                          addParams( options: modules["calc_peak_repro_cut"] )
include { CALCULATE_PEAK_REPROD           } from "../modules/local/modules/calculate_peak_reprod/main" addParams( options: modules["calc_peak_repro"]     )
include { EXPORT_META                     } from "../modules/local/export_meta"                        addParams( options: modules["export_meta"]         )
include { EXPORT_META as EXPORT_META_CTRL } from "../modules/local/export_meta"                        addParams( options: modules["export_meta"]         )
@@ -984,10 +985,37 @@ workflow CUTANDRUN {
        ch_samtools_bam      = ANNOTATE_FRIP_META.out.output
        //ch_samtools_bam | view

        /*
        * MODULE: Trim unwanted columns for downstream reporting
        */
        CUT_CALC_REPROD (
            AWK_NAME_PEAK_BED.out.file
        )

        /*
        * CHANNEL: Group samples based on group
        */
        CUT_CALC_REPROD.out.file
            .map { row -> [ row[0].group, row[1] ] }
            .groupTuple(by: [0])
            .map { row ->
                new_meta = [:]
                new_meta.put( "id", row[0] )
                [ new_meta, row[1].flatten() ]
            }
            .map { row ->
                [ row[0], row[1], row[1].size() ]
            }
            .filter { row -> row[2] > 1 }
            .map { row ->
                [ row[0], row[1] ]
            }
        .set { ch_seacr_bed_group_2 }

        /*
        * CHANNEL: Per group, create a channel per one against all combination
        */
        ch_seacr_bed_group
        ch_seacr_bed_group_2
            .flatMap{
                row ->
                new_output = []