Unverified Commit 3cbd828f authored by nchernia's avatar nchernia Committed by GitHub
Browse files

Merge pull request #240 from aidenlab/single_dedup

Single dedup
parents b19b7a77 1229e3b0
Loading
Loading
Loading
Loading
+43 −8
Original line number Diff line number Diff line
@@ -253,7 +253,7 @@ $0 !~ /^@/{
	    pos[j] = 1;
	  }
	}
	else if (str[j] == 0 && notprimary[j]==1 && length(singleend)>0) {
	else if (str[j] == 0 && notprimary[j]==256 && length(singleend)>0) {
          # count Ms,Ds,Ns,Xs,=s for sequence length
	  seqlength=0;
	  currstr=tmp[6];
@@ -596,7 +596,7 @@ $0 !~ /^@/{
	      interiorpos1 = adjust(pos[read1],str[read1],cigarstr[read1],notprimary[read1]);
	      interiorpos2 = adjust(pos[read2],str[read2],cigarstr[read2],notprimary[read2]);
	      sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",interiorpos1,interiorpos2);
	      cb_str = "cb:Z:"chr[read1]"_"chr[2]"_"fragstr"_"str[read1]"_"outputstr"_"sortpos;
	      cb_str = "cb:Z:"chr[read1]"_"chr[read2]"_"fragstr"_"str[read1]"_"outputstr"_"sortpos;

              # assign mate mapping quality, "read type", "interior position", "mate interior position"
	      # unclear for singleend what the read type should be; both 0/1 and 2/3 are possible
@@ -643,11 +643,29 @@ $0 !~ /^@/{
	split(c[i], tmp);
        # blacklist - if 3rd bit set (=4) it means unmapped
        mapped[j] = and(tmp[2],4) == 0;
        str[j] = and(tmp[2],16);
        # chromosome
        chr[j] = tmp[3];
        # get rid of "_" in chromosome name, for cb_str
        gsub(/_/, "", chr[j]);
        # position
        pos[j] = tmp[4];
        # cigar string
        cigarstr[j] = tmp[6];
      }
      if (mapped[0]) { 
      if (mapped[0] && length(singleend)>0) { 
	count_singleton++;
	# get 3' end for second end for dedupping
	interiorpos1 = adjust(pos[0],str[0],cigarstr[0],0);
	sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",pos[0],interiorpos1);
	cb_str = "cb:Z:"chr[0]"_"chr[0]"_0_2_"str[0]"_"str[0]"_"sortpos;
	# read type 7 is singleton reads
	print c[1],"rt:A:7",cb_str;
      }
      else if (mapped[0]) {
	count_singleton++;
	for (i in c) {
	  print c[i],"rt:A:8";
	  print c[i],"rt:A:7";
	}
      }
      else {
@@ -719,7 +737,7 @@ END{
	    pos[j] = 1;
	  }
	}
	else if (str[j] == 0 && notprimary[j]==1 && length(singleend)>0) {
	else if (str[j] == 0 && notprimary[j]==256 && length(singleend)>0) {
          # count Ms,Ds,Ns,Xs,=s for sequence length
	  seqlength=0;
	  currstr=tmp[6];
@@ -1106,14 +1124,31 @@ END{
      j=0;
      for (i in c) {
	split(c[i], tmp);
	split(tmp[1],readname,"/");
        # blacklist - if 3rd bit set (=4) it means unmapped
        mapped[j] = and(tmp[2],4) == 0;
        str[j] = and(tmp[2],16);
        # chromosome
        chr[j] = tmp[3];
        # get rid of "_" in chromosome name, for cb_str
        gsub(/_/, "", chr[j]);
        # position
        pos[j] = tmp[4];
        # cigar string
        cigarstr[j] = tmp[6];
      }
      if (mapped[0]) { 
      if (mapped[0] && length(singleend)>0) { 
	count_singleton++;
	# get 3' end for second end for dedupping
	interiorpos1 = adjust(pos[0],str[0],cigarstr[0],0);
	sortpos=sprintf("%0" chrlen "d_%0" chrlen "d",pos[0],interiorpos1);
	cb_str = "cb:Z:"chr[0]"_"chr[0]"_0_1_"str[0]"_"str[0]"_"sortpos;
	# read type 7 is singleton reads
	print c[1],"rt:A:7",cb_str;
      }
      else if (mapped[0]) {
	count_singleton++;
	for (i in c) {
	  print c[i],"rt:A:8";
	  print c[i],"rt:A:7";
	}
      }
      else {
+1 −0
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
# Script to clean up big repetitive files and zip fastqs. Run after you are 
# sure the pipeline ran successfully.  Run from top directory (HIC001 e.g.).
# Juicer version 2.0

rm aligned/merged1.txt 
rm aligned/merged30.txt 
rm -r splits 
+14 −14
Original line number Diff line number Diff line
@@ -136,14 +136,14 @@ $0 !~ /^@/ && $0 ~ /cb:/ && pname != $1 {
	    for (j=0; j<i; j++) {
		if (j in dups) {
		    count_dups++;
		    markduplicate(saved1[rname[j]]);
		    markduplicate(saved2[rname[j]]);
		    if (rname[j] in saved1) markduplicate(saved1[rname[j]]);
		    if (rname[j] in saved2) markduplicate(saved2[rname[j]]);
		    if (rname[j] in saved3) markduplicate(saved3[rname[j]]);
		    if (rname[j] in saved4) markduplicate(saved4[rname[j]]);
		}
		else {
		    print saved1[rname[j]];
		    print saved2[rname[j]];
		  if (rname[j] in saved1) print saved1[rname[j]];
		  if (rname[j] in saved2) print saved2[rname[j]];
		  if (rname[j] in saved3) print saved3[rname[j]];
		  if (rname[j] in saved4) print saved4[rname[j]];
		}
@@ -155,8 +155,8 @@ $0 !~ /^@/ && $0 ~ /cb:/ && pname != $1 {
	}
	# size of potential duplicate array is 1, by definition not a duplicate
	else if (i==1) {
	    print saved1[rname[0]];
	    print saved2[rname[0]];
	    if (rname[0] in saved1) print saved1[rname[0]];
	    if (rname[0] in saved2) print saved2[rname[0]];
	    if (rname[0] in saved3) print saved3[rname[0]];
	    if (rname[0] in saved4) print saved4[rname[0]];
	    delete saved1[rname[0]];
@@ -226,14 +226,14 @@ END {
	    for (j=0; j<i; j++) {
		if (j in dups) {
		    count_dups++;
		    markduplicate(saved1[rname[j]]);
		    markduplicate(saved2[rname[j]]);
		    if (rname[j] in saved1) markduplicate(saved1[rname[j]]);
		    if (rname[j] in saved2) markduplicate(saved2[rname[j]]);
		    if (rname[j] in saved3) markduplicate(saved3[rname[j]]);
		    if (rname[j] in saved4) markduplicate(saved4[rname[j]]);
		}
		else {
		    print saved1[rname[j]];
		    print saved2[rname[j]];
		    if (rname[j] in saved1) print saved1[rname[j]];
		    if (rname[j] in saved2) print saved2[rname[j]];
		    if (rname[j] in saved3) print saved3[rname[j]];
		    if (rname[j] in saved4) print saved4[rname[j]];
		}
@@ -241,8 +241,8 @@ END {
	}
	# size of potential duplicate array is 1, by definition not a duplicate
	else if (i==1) {
	    print saved1[rname[0]];
	    print saved2[rname[0]];
	    if (rname[0] in saved1) print saved1[rname[0]];
	    if (rname[0] in saved2) print saved2[rname[0]];
	    if (rname[0] in saved3) print saved3[rname[0]];
	    if (rname[0] in saved4) print saved4[rname[0]];
	}
+22 −6
Original line number Diff line number Diff line
@@ -27,6 +27,15 @@ processme != 0 {
	saved_mq = $5;
	saved_cigar = $6;
	saved_chr = $3;
	if (length(singleend)>0) {
	    for (ind=12; ind<=NF; ind++) {
		if ($(ind) ~ /^ep:/) {
		    split($(ind), ep_str, ":");
		}
	    }
	    saved_ext = int(ep_str[3]);
	}
	else saved_ext = "";
	next;
    }
    else {
@@ -54,10 +63,11 @@ processme != 0 {
	    else if ($i ~ /^mp/) {
		split($i, mp, ":");
	    }
	    else if ($i ~ /^rt:/) {
		split($i, rt, ":");
	    }
	for (ind=12; ind<=NF; ind++) {
	    if ($(ind) ~ /^rt:/) {
		split($(ind), rt, ":");
	    else if ($i ~ /^ep:/) {
		split($i, ep_str, ":");
	    }
	}
	if (rt[3]%2==0) {
@@ -87,8 +97,14 @@ processme != 0 {

	# merged_nodups
	if (use_external_pos) {
	    if (length(singleend)>0) {
		pos1 = saved_ext;
		pos2 = int(ep_str[3]);
	    }
	    else { 
		pos1 = extpos1;
		pos2 = extpos2;
	    }
	    print str1, chr1, pos1, frag1, str2, chr2, pos2, frag2, mq1, cigar1, seq1, mq2, cigar2, seq2, $1, $1;
	}
	else {
+51 −9
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ BEGIN {
  if (length(fname)>0) {
    while (getline < fname) {
      dups=$1;
      singdups=$2;
    }
  }
}
@@ -44,16 +45,57 @@ BEGIN {
}
END{
  if (tot==0) tot=1;
  printf("Sequenced Read Pairs:  %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n", tot, norm, norm*100/tot, chim, chim*100/tot, coll, coll*100/tot, unm, unm*100/tot);
  if (length(singleend)==0) {
      printf("Read type: Paired End\n");
      printf("Sequenced Read Pairs:  %'d\n", tot);
      # in the future, 1 alignment - on one end / substantially overlapping in sum
      printf("No chimera found: %'d (%0.2f%)\n",  unm, unm*100/tot);
      printf(" One or both reads unmapped: %'d (%0.2f%)\n",  unm, unm*100/tot);
      # 1 alignment would go here and be in sum of "no chimera found"
      printf("2 alignments: %'d (%0.2f%)\n", norm+chim, (norm+chim)*100/tot);
      printf(" 2 alignments (A...B): %'d (%0.2f%)\n", norm, norm*100/tot);
      printf(" 2 alignments (A1….A2B; A1B2...B1A2): %'d (%0.2f%)\n", chim, chim*100/tot);
      printf("3 or more alignments: %'d (%0.2f%)\n", coll, coll*100/tot);
     if (ligation ~ /XXXX/) {
       printf("Ligation Motif Present: N/A\n");
     }
     else {
       printf("Ligation Motif Present: %'d (%0.2f%)\n", lig, lig*100/tot);
     }
     printf("Average insert size: %0.2f\n",insertsize/NR);
     alignable=norm+chim;
  printf(" Single Alignment: %'d (%0.2f%)\n Average insert size: %0.2f\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n",  singleton, singleton*100/tot, insertsize/NR, alignable, alignable*100/tot);
     uniq=alignable-dups;
     if (alignable==0) alignable=1;
  printf("Unique Reads: %'d (%0.2f%, %0.2f%)\nDuplicates: %'d (%0.2f%, %0.2f%)\n", uniq, uniq*100/alignable, uniq*100/tot, dups, dups*100/alignable, dups*100/tot);
     printf("Total Unique: %'d (%0.2f%, %0.2f%)\nTotal Duplicates: %'d (%0.2f%, %0.2f%)\n", uniq, uniq*100/alignable, uniq*100/tot, dups, dups*100/alignable, dups*100/tot);
  }
  else {
      printf("Read type: Single End\n");
      printf("Sequenced Reads:  %'d\n", tot);
      # in the future, 1 alignment - on one end / substantially overlapping in sum
      printf("No chimera found: %'d (%0.2f%)\n",  unm+singleton, (unm+singleton)*100/tot);
      printf(" 0 alignments: %'d (%0.2f%)\n",  unm, unm*100/tot);
      printf(" 1 alignment: %'d (%0.2f%)\n",  singleton, singleton*100/tot);
      printf("2 alignments: %'d (%0.2f%)\n", chim, chim*100/tot);
      printf("3 or more alignments: %'d (%0.2f%)\n", coll, coll*100/tot);
     if (ligation ~ /XXXX/) {
       printf("Ligation Motif Present: N/A\n");
     }
     else {
       printf("Ligation Motif Present: %'d (%0.2f%)\n", lig, lig*100/tot);
     }
     singuniq=singleton-singdups;
     if (singleton==0) singleton=1;
     printf("1 alignment unique: %'d (%0.2f%, %0.2f%)\n", singuniq, singuniq*100/singleton, singuniq*100/tot);
     printf("1 alignment duplicates: %'d (%0.2f%, %0.2f%)\n", singdups, singdups*100/singleton, singdups*100/tot);
     #Library Complexity Estimate (1 alignment)*
     alignable=norm+chim;
     uniq=alignable-dups;
     if (alignable==0) alignable=1;
     printf("2 alignment unique: %'d (%0.2f%, %0.2f%)\n", uniq, uniq*100/alignable, uniq*100/tot);
     printf("2 alignment duplicates: %'d (%0.2f%, %0.2f%)\n", dups, dups*100/alignable, dups*100/tot);
     #Library Complexity Estimate (2 alignment)*
     printf("Total Unique: %'d (%0.2f%, %0.2f%)\n", uniq+singuniq, (uniq+singuniq)*100/(singleton+alignable), (uniq+singuniq)*100/tot);
     printf("Total Duplicates: %'d (%0.2f%, %0.2f%)\n", dups+singdups, (dups+singdups)*100/(singleton+alignable), (dups+singdups)*100/tot);
     #Library Complexity Estimate (1+2 above)*
  }
}
Loading