Merge pull request #240 from aidenlab/single_dedup (3cbd828f) · Commits · Chaos / chaos.juicer

CPU/common/chimeric_sam.awk

+43 −8

Original line number	Diff line number	Diff line
		@@ -253,7 +253,7 @@ $0 !~ /^@/{
		pos[j] = 1;
		}
		}
		else if (str[j] == 0 && notprimary[j]==1 && length(singleend)>0) {
		else if (str[j] == 0 && notprimary[j]==256 && length(singleend)>0) {
		# count Ms,Ds,Ns,Xs,=s for sequence length
		seqlength=0;
		currstr=tmp[6];
		@@ -596,7 +596,7 @@ $0 !~ /^@/{
		interiorpos1 = adjust(pos[read1],str[read1],cigarstr[read1],notprimary[read1]);
		interiorpos2 = adjust(pos[read2],str[read2],cigarstr[read2],notprimary[read2]);
		sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",interiorpos1,interiorpos2);
		cb_str = "cb:Z:"chr[read1]"_"chr[2]"_"fragstr"_"str[read1]"_"outputstr"_"sortpos;
		cb_str = "cb:Z:"chr[read1]"_"chr[read2]"_"fragstr"_"str[read1]"_"outputstr"_"sortpos;

		# assign mate mapping quality, "read type", "interior position", "mate interior position"
		# unclear for singleend what the read type should be; both 0/1 and 2/3 are possible
		@@ -643,11 +643,29 @@ $0 !~ /^@/{
		split(c[i], tmp);
		# blacklist - if 3rd bit set (=4) it means unmapped
		mapped[j] = and(tmp[2],4) == 0;
		str[j] = and(tmp[2],16);
		# chromosome
		chr[j] = tmp[3];
		# get rid of "_" in chromosome name, for cb_str
		gsub(/_/, "", chr[j]);
		# position
		pos[j] = tmp[4];
		# cigar string
		cigarstr[j] = tmp[6];
		}
		if (mapped[0]) {
		if (mapped[0] && length(singleend)>0) {
		count_singleton++;
		# get 3' end for second end for dedupping
		interiorpos1 = adjust(pos[0],str[0],cigarstr[0],0);
		sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",pos[0],interiorpos1);
		cb_str = "cb:Z:"chr[0]"_"chr[0]"_0_2_"str[0]"_"str[0]"_"sortpos;
		# read type 7 is singleton reads
		print c[1],"rt:A:7",cb_str;
		}
		else if (mapped[0]) {
		count_singleton++;
		for (i in c) {
		print c[i],"rt:A:8";
		print c[i],"rt:A:7";
		}
		}
		else {
		@@ -719,7 +737,7 @@ END{
		pos[j] = 1;
		}
		}
		else if (str[j] == 0 && notprimary[j]==1 && length(singleend)>0) {
		else if (str[j] == 0 && notprimary[j]==256 && length(singleend)>0) {
		# count Ms,Ds,Ns,Xs,=s for sequence length
		seqlength=0;
		currstr=tmp[6];
		@@ -1106,14 +1124,31 @@ END{
		j=0;
		for (i in c) {
		split(c[i], tmp);
		split(tmp[1],readname,"/");
		# blacklist - if 3rd bit set (=4) it means unmapped
		mapped[j] = and(tmp[2],4) == 0;
		str[j] = and(tmp[2],16);
		# chromosome
		chr[j] = tmp[3];
		# get rid of "_" in chromosome name, for cb_str
		gsub(/_/, "", chr[j]);
		# position
		pos[j] = tmp[4];
		# cigar string
		cigarstr[j] = tmp[6];
		}
		if (mapped[0]) {
		if (mapped[0] && length(singleend)>0) {
		count_singleton++;
		# get 3' end for second end for dedupping
		interiorpos1 = adjust(pos[0],str[0],cigarstr[0],0);
		sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",pos[0],interiorpos1);
		cb_str = "cb:Z:"chr[0]"_"chr[0]"_0_1_"str[0]"_"str[0]"_"sortpos;
		# read type 7 is singleton reads
		print c[1],"rt:A:7",cb_str;
		}
		else if (mapped[0]) {
		count_singleton++;
		for (i in c) {
		print c[i],"rt:A:8";
		print c[i],"rt:A:7";
		}
		}
		else {

CPU/common/cleanup.sh

+1 −0

Original line number	Diff line number	Diff line
		@@ -25,6 +25,7 @@
		# Script to clean up big repetitive files and zip fastqs. Run after you are
		# sure the pipeline ran successfully. Run from top directory (HIC001 e.g.).
		# Juicer version 2.0

		rm aligned/merged1.txt
		rm aligned/merged30.txt
		rm -r splits

CPU/common/dups_sam.awk

+14 −14

Original line number	Diff line number	Diff line
		@@ -136,14 +136,14 @@ $0 !~ /^@/ && $0 ~ /cb:/ && pname != $1 {
		for (j=0; j<i; j++) {
		if (j in dups) {
		count_dups++;
		markduplicate(saved1[rname[j]]);
		markduplicate(saved2[rname[j]]);
		if (rname[j] in saved1) markduplicate(saved1[rname[j]]);
		if (rname[j] in saved2) markduplicate(saved2[rname[j]]);
		if (rname[j] in saved3) markduplicate(saved3[rname[j]]);
		if (rname[j] in saved4) markduplicate(saved4[rname[j]]);
		}
		else {
		print saved1[rname[j]];
		print saved2[rname[j]];
		if (rname[j] in saved1) print saved1[rname[j]];
		if (rname[j] in saved2) print saved2[rname[j]];
		if (rname[j] in saved3) print saved3[rname[j]];
		if (rname[j] in saved4) print saved4[rname[j]];
		}
		@@ -155,8 +155,8 @@ $0 !~ /^@/ && $0 ~ /cb:/ && pname != $1 {
		}
		# size of potential duplicate array is 1, by definition not a duplicate
		else if (i==1) {
		print saved1[rname[0]];
		print saved2[rname[0]];
		if (rname[0] in saved1) print saved1[rname[0]];
		if (rname[0] in saved2) print saved2[rname[0]];
		if (rname[0] in saved3) print saved3[rname[0]];
		if (rname[0] in saved4) print saved4[rname[0]];
		delete saved1[rname[0]];
		@@ -226,14 +226,14 @@ END {
		for (j=0; j<i; j++) {
		if (j in dups) {
		count_dups++;
		markduplicate(saved1[rname[j]]);
		markduplicate(saved2[rname[j]]);
		if (rname[j] in saved1) markduplicate(saved1[rname[j]]);
		if (rname[j] in saved2) markduplicate(saved2[rname[j]]);
		if (rname[j] in saved3) markduplicate(saved3[rname[j]]);
		if (rname[j] in saved4) markduplicate(saved4[rname[j]]);
		}
		else {
		print saved1[rname[j]];
		print saved2[rname[j]];
		if (rname[j] in saved1) print saved1[rname[j]];
		if (rname[j] in saved2) print saved2[rname[j]];
		if (rname[j] in saved3) print saved3[rname[j]];
		if (rname[j] in saved4) print saved4[rname[j]];
		}
		@@ -241,8 +241,8 @@ END {
		}
		# size of potential duplicate array is 1, by definition not a duplicate
		else if (i==1) {
		print saved1[rname[0]];
		print saved2[rname[0]];
		if (rname[0] in saved1) print saved1[rname[0]];
		if (rname[0] in saved2) print saved2[rname[0]];
		if (rname[0] in saved3) print saved3[rname[0]];
		if (rname[0] in saved4) print saved4[rname[0]];
		}

CPU/common/sam_to_pre.awk

+22 −6

Original line number	Diff line number	Diff line
		@@ -27,6 +27,15 @@ processme != 0 {
		saved_mq = $5;
		saved_cigar = $6;
		saved_chr = $3;
		if (length(singleend)>0) {
		for (ind=12; ind<=NF; ind++) {
		if ($(ind) ~ /^ep:/) {
		split($(ind), ep_str, ":");
		}
		}
		saved_ext = int(ep_str[3]);
		}
		else saved_ext = "";
		next;
		}
		else {
		@@ -54,10 +63,11 @@ processme != 0 {
		else if ($i ~ /^mp/) {
		split($i, mp, ":");
		}
		else if ($i ~ /^rt:/) {
		split($i, rt, ":");
		}
		for (ind=12; ind<=NF; ind++) {
		if ($(ind) ~ /^rt:/) {
		split($(ind), rt, ":");
		else if ($i ~ /^ep:/) {
		split($i, ep_str, ":");
		}
		}
		if (rt[3]%2==0) {
		@@ -87,8 +97,14 @@ processme != 0 {

		# merged_nodups
		if (use_external_pos) {
		if (length(singleend)>0) {
		pos1 = saved_ext;
		pos2 = int(ep_str[3]);
		}
		else {
		pos1 = extpos1;
		pos2 = extpos2;
		}
		print str1, chr1, pos1, frag1, str2, chr2, pos2, frag2, mq1, cigar1, seq1, mq2, cigar2, seq2, $1, $1;
		}
		else {

CPU/common/stats_sub.awk

+51 −9

Original line number	Diff line number	Diff line
		@@ -29,6 +29,7 @@ BEGIN {
		if (length(fname)>0) {
		while (getline < fname) {
		dups=$1;
		singdups=$2;
		}
		}
		}
		@@ -44,16 +45,57 @@ BEGIN {
		}
		END{
		if (tot==0) tot=1;
		printf("Sequenced Read Pairs: %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n", tot, norm, norm100/tot, chim, chim100/tot, coll, coll100/tot, unm, unm100/tot);
		if (length(singleend)==0) {
		printf("Read type: Paired End\n");
		printf("Sequenced Read Pairs: %'d\n", tot);
		# in the future, 1 alignment - on one end / substantially overlapping in sum
		printf("No chimera found: %'d (%0.2f%)\n", unm, unm*100/tot);
		printf(" One or both reads unmapped: %'d (%0.2f%)\n", unm, unm*100/tot);
		# 1 alignment would go here and be in sum of "no chimera found"
		printf("2 alignments: %'d (%0.2f%)\n", norm+chim, (norm+chim)*100/tot);
		printf(" 2 alignments (A...B): %'d (%0.2f%)\n", norm, norm*100/tot);
		printf(" 2 alignments (A1….A2B; A1B2...B1A2): %'d (%0.2f%)\n", chim, chim*100/tot);
		printf("3 or more alignments: %'d (%0.2f%)\n", coll, coll*100/tot);
		if (ligation ~ /XXXX/) {
		printf("Ligation Motif Present: N/A\n");
		}
		else {
		printf("Ligation Motif Present: %'d (%0.2f%)\n", lig, lig*100/tot);
		}
		printf("Average insert size: %0.2f\n",insertsize/NR);
		alignable=norm+chim;
		printf(" Single Alignment: %'d (%0.2f%)\n Average insert size: %0.2f\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n", singleton, singleton100/tot, insertsize/NR, alignable, alignable100/tot);
		uniq=alignable-dups;
		if (alignable==0) alignable=1;
		printf("Unique Reads: %'d (%0.2f%, %0.2f%)\nDuplicates: %'d (%0.2f%, %0.2f%)\n", uniq, uniq100/alignable, uniq100/tot, dups, dups100/alignable, dups100/tot);
		printf("Total Unique: %'d (%0.2f%, %0.2f%)\nTotal Duplicates: %'d (%0.2f%, %0.2f%)\n", uniq, uniq100/alignable, uniq100/tot, dups, dups100/alignable, dups100/tot);
		}
		else {
		printf("Read type: Single End\n");
		printf("Sequenced Reads: %'d\n", tot);
		# in the future, 1 alignment - on one end / substantially overlapping in sum
		printf("No chimera found: %'d (%0.2f%)\n", unm+singleton, (unm+singleton)*100/tot);
		printf(" 0 alignments: %'d (%0.2f%)\n", unm, unm*100/tot);
		printf(" 1 alignment: %'d (%0.2f%)\n", singleton, singleton*100/tot);
		printf("2 alignments: %'d (%0.2f%)\n", chim, chim*100/tot);
		printf("3 or more alignments: %'d (%0.2f%)\n", coll, coll*100/tot);
		if (ligation ~ /XXXX/) {
		printf("Ligation Motif Present: N/A\n");
		}
		else {
		printf("Ligation Motif Present: %'d (%0.2f%)\n", lig, lig*100/tot);
		}
		singuniq=singleton-singdups;
		if (singleton==0) singleton=1;
		printf("1 alignment unique: %'d (%0.2f%, %0.2f%)\n", singuniq, singuniq100/singleton, singuniq100/tot);
		printf("1 alignment duplicates: %'d (%0.2f%, %0.2f%)\n", singdups, singdups100/singleton, singdups100/tot);
		#Library Complexity Estimate (1 alignment)*
		alignable=norm+chim;
		uniq=alignable-dups;
		if (alignable==0) alignable=1;
		printf("2 alignment unique: %'d (%0.2f%, %0.2f%)\n", uniq, uniq100/alignable, uniq100/tot);
		printf("2 alignment duplicates: %'d (%0.2f%, %0.2f%)\n", dups, dups100/alignable, dups100/tot);
		#Library Complexity Estimate (2 alignment)*
		printf("Total Unique: %'d (%0.2f%, %0.2f%)\n", uniq+singuniq, (uniq+singuniq)100/(singleton+alignable), (uniq+singuniq)100/tot);
		printf("Total Duplicates: %'d (%0.2f%, %0.2f%)\n", dups+singdups, (dups+singdups)100/(singleton+alignable), (dups+singdups)100/tot);
		#Library Complexity Estimate (1+2 above)*
		}
		}

Admin message