Unverified Commit 7360728b authored by Chris Cheshire's avatar Chris Cheshire Committed by GitHub
Browse files

Merge pull request #72 from luslab/dev

Fixes
parents aeeff25d bf2e90af
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -23,7 +23,7 @@ jobs:
    strategy:
      matrix:
        # Nextflow versions: check pipeline minimum and current latest
        nxf_ver: ['21.04.0', '']
        nxf_ver: ['21.04.3', '']
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v2
@@ -62,7 +62,7 @@ jobs:
    strategy:
      matrix:
        # We only run this on the specified nextflow version for the pipeline
        nxf_ver: ['21.04.0']
        nxf_ver: ['']
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v2
@@ -136,7 +136,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
        nxf_version: ["21.04.0"]
        nxf_version: ["21.10.3"]
        tags: ["samplesheet", "verify_output_input", "verify_output_align_short", "verify_output_align_long", "verify_output_peak_calling", "verify_output_reporting", "verify_output_save", "verify_output_skip"]
    steps:
      - uses: actions/checkout@v2
+2 −2
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@
[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/cutandrun/results)
[![DOI](http://img.shields.io/badge/DOI-10.5281/zenodo.5653535-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.5653535)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.3-23aa62.svg?labelColor=000000)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
@@ -45,7 +45,7 @@ The pipeline has been developed with continuous integration (CI) in mind. nf-cor

## Quick Start

1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.04.0`)
1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.04.3`)

2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_

+21 −9
Original line number Diff line number Diff line
@@ -37,6 +37,17 @@ peak_file_list = glob.glob(args.peaks)

frips = []
for idx, bam_file in enumerate(bam_file_list):
    # Init
    frip = 0

    # Read first line
    first_line = None
    with open(peak_file_list[idx], "r") as file:
        for line in file:
            first_line = line
            break

    if first_line is not None:
        print("Calculating " + bam_file + " using " + peak_file_list[idx])
        cr = crpb.CountReadsPerBin([bam_file], bedFile=[peak_file_list[idx]], numberOfProcessors=int(args.threads))

@@ -49,6 +60,7 @@ for idx, bam_file in enumerate(bam_file_list):

        # Calc frip
        frip = float(total[0]) / bam.mapped

    frips.append(str(frip))

    # Log
+33 −8
Original line number Diff line number Diff line
@@ -129,10 +129,22 @@ class Reports:
            # Create dataframe from csv file for each file and save to a list
            dt_frag_i = pd.read_csv(dt_frag_list[i], sep='\t', header=None, names=['Size','Occurrences'])
            frag_base_i = os.path.basename(dt_frag_list[i])
            sample_id = frag_base_i.split(".")[0]
            sample_id_split = sample_id.rsplit("_", 1)
            rep_i = sample_id_split[1]
            group_i = sample_id_split[0]

            #  split txt files on dots
            sample_id_list = frag_base_i.split(".")

            # re-join all but the last two dot-separated parts to recover the sample id
            separator = ""
            sample_id = separator.join(sample_id_list[0:-2])

            # split sample id on underscores
            sample_id_split_list = sample_id.split("_")

            # join all but the last underscore-separated element for the group id
            group_i = separator.join(sample_id_split_list[0:-1])

            #  take last element of this list for replicate number
            rep_i = sample_id_split_list[-1]

            # Create long forms of fragment histograms
            dt_frag_i_long = np.repeat(dt_frag_i['Size'].values, dt_frag_i['Occurrences'].values)
@@ -203,10 +215,23 @@ class Reports:
        for i in list(range(len(seacr_bed_list))):
            seacr_bed_i = pd.read_csv(seacr_bed_list[i], sep='\t', header=None, usecols=[0,1,2,3,4], names=['chrom','start','end','total_signal','max_signal'])
            bed_base_i = os.path.basename(seacr_bed_list[i])
            sample_id = bed_base_i.split(".")[0]
            sample_id_split = sample_id.rsplit("_", 1)
            rep_i = sample_id_split[1]
            group_i = sample_id_split[0]

            #  split bed files on dots
            bed_id_list = bed_base_i.split(".")

            # re-join all but the last four dot-separated parts to recover the sample id
            separator = ""
            sample_id = separator.join(bed_id_list[0:-4])

            # split sample id on underscores
            sample_id_split_list = sample_id.split("_")

            # join all but the last underscore-separated element for the group id
            group_i = separator.join(sample_id_split_list[0:-1])

            # take last element of this list for replicate number
            rep_i = sample_id_split_list[-1]

            seacr_bed_i['group'] = np.repeat(group_i, seacr_bed_i.shape[0])
            seacr_bed_i['replicate'] = np.repeat(rep_i, seacr_bed_i.shape[0])

+33 −14
Original line number Diff line number Diff line
@@ -31,27 +31,44 @@ args = parser.parse_args()

# Init
peak_perc = 0
numfiles = 0
num_columns = 0

print('Reading file')

# Read file in using dask
ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','overlap_1','key','a_name','b_name','count'],
    dtype={'chrom':str,'start':np.int64,'end':np.int64,'overlap_1':np.float64,'key':np.float64,'a_name':str,'b_name':str,'count':np.int64})
# Read first line
first_line = None
with open(args.intersect, "r") as file:
    for line in file:
        first_line = line
        break

if first_line is not None:
    first_line_split = first_line.split('\t')
    num_columns = len(first_line_split)
    numfiles = 1

if numfiles != 0:
    print('Number of columns: ' + str(num_columns))

# Find number of files
numfiles = ddf_inter['b_name'].max().compute()
    ddf_inter = None
    if num_columns == 6:
        # Read file in using dask
        ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','key','file_num','count'],
            dtype={'chrom':str,'start':np.int64,'end':np.int64,'key':str, 'file_num':np.int32, 'count':np.int32})
        numfiles = ddf_inter['file_num'].max().compute()

# Check for table format
if isinstance(numfiles, str):
    print('Detected single file, reloading table')
    elif num_columns == 5:
        # Read file in using dask
        ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','key','count'],
            dtype={'chrom':str,'start':np.int64,'end':np.int64,'key':str, 'file_num':np.int32, 'count':np.int32})
        numfiles = 1
    ddf_inter = dd.read_csv(args.intersect, sep='\t', header=None, names=['chrom','start','end','overlap_1','overlap_2','key','name','count'],
        dtype={'chrom':str,'start':np.int64,'end':np.int64,'overlap_1':np.float64,'overlap_2':np.float64,'key':str,'name':str,'count':np.int64})
    else:
        print('Invalid file format detected')
        exit(1)

    print('Number of files: ' + str(numfiles))

# Check for empty file
if numfiles != 0:
    # Find total number of peaks
    ddf_inter_grouped = ddf_inter.groupby(by=["key"]).size()
    df_inter_grouped = ddf_inter_grouped.compute()
@@ -73,6 +90,8 @@ if numfiles != 0:

        # Calc peak percentage
        peak_perc = (overlap_peaks / total_peaks) * 100
else:
    print('Empty file detected')

# Create string and write to file
output_string = str(peak_perc)
Loading