Commit f0317a2f authored by Aiden Lab's avatar Aiden Lab
Browse files

mostly working version of 1.6

parent 32f56437
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -29,7 +29,7 @@

# Start by checking the statistics to see if they add up 
res1=`cat ${splitdir}/*norm*res*`
check1=`cat ${splitdir}/*norm*res* | awk '{s2+=$2; s3+=$3; s4+=$4; s5+=$5; s6+=$6}END{if (s2 != s3+s4+s5+s6){print 0}else{print 1}}'`
check1=`cat ${splitdir}/*norm*res* | awk '{s2+=$2; s3+=$3; s4+=$4; s5+=$5; s6+=$6; s7+=$7}END{if (s2 != s3+s4+s5+s6+s7){print 0}else{print 1}}'`
if [ $check1 -eq 0 ] || [ -z "$res1" ]
then
    echo "***! Error! The statistics do not add up. Alignment likely failed to complete on one or more files. Run relaunch_prep.sh"
+435 −396
Original line number Diff line number Diff line
@@ -23,24 +23,24 @@
#  THE SOFTWARE.
##########
# Takes a SAM file, looks for chimeric reads and normal reads.  Outputs  
# only those reads mapped to chromosomes 1-24 with MAPQ > 0.
# only those reads mapped with MAPQ > 0
#
# Output to fname1 is of the form:
# strand1 chromosome1 position1 frag1 strand2 chromosome2 position2 frag2
#  mapq1 cigar1 seq1 mapq2 cigar2 seq2
#   where chr1 is always <= chr2
# 
# Output to fname2 retains SAM format.
# SAMs are also outputted.
#
# Chimeric reads are treated as follows:
# "Normal" chimeras (with 3 reads, two with the ligation junction), where the
# third read (w/o ligation junction) is within 20Kbp of one of the ligation
# junction reads, are sent to fname1.  All other chimeras (where either there
# junction reads, are sent to fname"_norm".  All other chimeras (where either there
# are more than 3 reads, or they don't contain the ligation junction, or the
# one non-ligation junction end is far from the others, are sent to fname2.
# one non-ligation junction end is far from the others, are sent to fname"_abnorm".
#
# awk -f chimeric_blacklist.awk -v fname1="norm_chimera" fname2="abnorm_chimera" fname3="unmapped"
# Juicer version 1.5 
# awk -f chimeric_blacklist.awk -v fname="stem"
# Juicer version 1.6

# returns absolute value
function abs(value)
@@ -73,11 +73,35 @@ function less_than(s1,c1,p1,s2,c2,p2)
BEGIN{
  OFS="\t";
  tottot = -1; # will count first non-group
  count_unmapped = -1; # will count first non-group
  norm=fname"_norm.txt"
  alignable=fname"_alignable.sam"
  abnorm=fname"_abnorm.sam"
  unmapped=fname"_unmapped.sam"
  mapq0=fname"_mapq0.sam"
  linenum=0
}
$0 ~ /^@/{
  # print SAM header to SAM files
  print > alignable;
  linenum++;
  print > abnorm;
  print > unmapped;
  print > mapq0;
}
$0 !~ /^@/{
  if (tottot == -1) {
    print "@PG", "ID:Juicer", "VN:1.6" > alignable;
    linenum++;
    print "@PG", "ID:Juicer", "VN:1.6" > abnorm;
    print "@PG", "ID:Juicer", "VN:1.6" > unmapped;
    print "@PG", "ID:Juicer", "VN:1.6" > mapq0;
  }
{
  # input file is sorted by read name.  Look at read name to group 
  # appropriately
  # we don't need this with paired end but might as well keep it
  split($1,a,"/"); 

  if(a[1]==prev){
    # move on to next record.  look below this block for actions that occur
    # regardless.
@@ -86,12 +110,55 @@ BEGIN{
  else {
    # deal with read pair group
    tottot++;
    if (count==3) {
      # chimeric read
      for (j=1; j <= 3; j++) {
    normal=0; # assume not normal, will get set if normal
    # first take care of unmapped reads
    # blacklist - if 3rd bit set (=4) it means unmapped
    sum_mapped = 0;
    for (j=1; j <= count; j++) {
        split(c[j],tmp);
	mapped[j] = and(tmp[2],4) == 0;
	sum_mapped = sum_mapped+mapped[j];
    }
    if (sum_mapped < 2) {
      # unmapped
      for (j=1; j <= count; j++) {
	print c[j] >> unmapped;
      }
      # otherwise first time through will count extra unmapped read
      count_unmapped++;
    }
    else {
      # find minimum mapq; if it's 0, send to mapq0 file
      minmapq=100;
      for (j=1; j <= count; j++) {
        split(c[j],tmp);
	split(tmp[1],readname,"/");
	read[j] = readname[2];
	if (tmp[5] < minmapq) {
          minmapq=tmp[5];
	}
      }
      if (minmapq == 0) {
	  count_mapq0++;
        for (j=1; j <= count; j++) {
          print c[j] >> mapq0;
	}
      }
      else if ((count > 4) || (count == 1)) {
        # count==1 shouldn't happen; occurs with alternate aligners at times
	for (j=1; j <= count; j++) {
          print c[j] >> abnorm;
	}
      }
      else {
        # count is 2,3,4.
	# 2 will be normal paired 
	# 3 will be chimeric paired or abnormal
        # (chimeric pair, depends on where the ends are)
        # 4 will be chimeric paired or abnormal
        # (chimeric pair, depends on where the ends are) 
	for (j=1; j <= count; j++) {
	  split(c[j],tmp);
	  # first in pair: 64
	  read[j] = (and(tmp[2], 64) > 0);
	  name[j] = tmp[1];
	
	  # strand; Bit 16 set means reverse strand
@@ -106,7 +173,6 @@ BEGIN{
	  cigarstr[j] = tmp[6];
	  # sequence
	  seq[j] = tmp[10];
        #qual[j] = tmp[11];
	
	  # get rid of soft clipping to know correct position
	  if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
@@ -151,26 +217,67 @@ BEGIN{
	    if (chr[j] ~ /MT/ && pos[j] >= 16569) {
	      pos[j] = pos[j] - 16569;
	    }
	  } # else if str[j]==16
	} # end for loop, now have all info about each read end

	if (sum_mapped == 2) {
	  # take these as the reads
	  read1=0;
	  read2=0;
	  for (j=1; j <= count; j++) {
	    if (mapped[j] && !read1) read1=j;
	    else if (mapped[j] && read1) read2=j;
	  }
	  normal=1;
	}
	else if (sum_mapped == 4) {
	  # looking for A/B...A/B
	  # will always be first in pair, second in pair
	  if (read[1] == read[2] && read[3] == read[4] && read[1] != read[3]) {
	    dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
	    dist[24] = abs(chr[2]-chr[4])*10000000 + abs(pos[2]-pos[4]);
	    dist[14] = abs(chr[1]-chr[4])*10000000 + abs(pos[1]-pos[4]);
	    dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
		
	    # A/B, A/B
	    if ((dist[13] < 1000 && dist[24] < 1000) || (dist[14] < 1000 && dist[23] < 1000)) {
	      read1 = 1;
	      read2 = 2;
	      normal = 1;
	    }
        # blacklist - if 3rd bit set (=4) it means unmapped
        mapped[j] = and(tmp[2],4) == 0; 
	    else {
	      # abnormal
	      for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
	      }
			
	      count_abnorm++;
	    }
	  }
	  else { # count 4, not close together
	    # abnormal
	    for (j=1; j <= count; j++) {
	      print c[j] >> abnorm;
	    }
	    count_abnorm++;
	  }
	} # if sum_mapped == 4
	else { # sum_mapped == 3
          dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]);
	  dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
	  dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
    
	  if (min(dist[12],min(dist[23],dist[13])) < 1000) {
	# The paired ends look like A/B...B.  Make sure we take A and B.
	    # The paired ends look like A/B...B. Make sure we take A and B
	    if (read[1] == read[2]) {
	      # take the unique one "B" for sure
	      read2 = 3;
	      # take the end of "A/B" that isn't close to "B"
	      read1 = dist[13] > dist[23] ? 1:2;
	    }
	else if (read[1] == read[3]) {
	    else if (read[1] == read[3]) { # but this shouldn't happen
	      read2 = 2;
	      read1 = dist[12] > dist[23] ? 1:3;
	      print "This shouldn't happen",name[read2] > "neva_err"
	    }
	    else if (read[2] == read[3]) {
	      read2 = 1;
@@ -180,127 +287,44 @@ BEGIN{
	      printf("reads strange\n") > "/dev/stderr"
	      exit 1
	    }
	
	if (mapped[read1] && mapped[read2]) {
	  count_norm++;
	  if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
	  #  print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2],qual[read1],qual[read2] > fname1;
	    print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
	  }
	  else {
	  #  print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1],qual[read2],qual[read1] > fname1;
	    print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
	  }
	}
	else {
	  for (i in c) {
	    print c[i] > fname3;
	  }	
	  count_unmapped++;
	}
	    normal = 1;
	  }
	  else {
	    # chimeric read with the 3 ends > 1KB apart
	    count_abnorm++;
	    for (i in c) {
	  print c[i] > fname2;
	}
      }
    }
    else if (count > 3) {
      # chimeric read > 3, too many to deal with
      count_abnorm++;
      for (i in c) {
	print c[i] > fname2;
      }
    }
    else if (count == 2) {
      # code here should be same as above, but it's a "normal" read
      j = 0;
      for (i in c) {
	split(c[i], tmp);
	split(tmp[1],readname,"/");
	str[j] = and(tmp[2],16);
	chr[j] = tmp[3];
	pos[j] = tmp[4];
	m[j] = tmp[5];
	cigarstr[j] = tmp[6];
	seq[j] = tmp[10];
        #qual[j] = tmp[11];
	name[j] = tmp[1];
	
        # blacklist - if 3rd bit set (=4) it means unmapped
        mapped[j] = and(tmp[2],4) == 0; 
	
	if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
	  split(tmp[6], cigar, "S");
	  pos[j] = pos[j] - cigar[1];
	  if (pos[j] <= 0) {
	    pos[j] = 1;
	  }
	}
	else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
	  split(tmp[6], cigar, "H");
	  pos[j] = pos[j] - cigar[1];
	  if (pos[j] <= 0) {
	    pos[j] = 1;
	  }
	}
	else if (str[j] == 16) {
	  # count Ms,Ds,Ns,Xs,=s for sequence length 
	  seqlength=0; 
	  currstr=tmp[6];
	  where=match(currstr, /[0-9]+[M|D|N|X|=]/); 
	  while (where>0) {
	    seqlength+=substr(currstr,where,RLENGTH-1)+0;
	    currstr=substr(currstr,where+RLENGTH);
	    where=match(currstr, /[0-9]+[M|D|N|X|=]/);
	  }
	  pos[j] = pos[j] + seqlength - 1;
	  if (tmp[6] ~ /[0-9]+S$/) {
	    where = match(tmp[6],/[0-9]+S$/);
	    cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	    pos[j] = pos[j] + cigloc;
	      print c[i] > abnorm;
	    }
	  if (tmp[6] ~ /[0-9]+H$/) {
	    where = match(tmp[6],/[0-9]+H$/);
	    cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	    pos[j] = pos[j] + cigloc;
	  }
	  if (chr[j] ~ /MT/ && pos[j] >= 16569) {
	    pos[j] = pos[j] - 16569;
	  }
	}
	j++;
      }
      if (mapped[0] && mapped[1]) {
	} # sum_mapped == 3
	if (normal) {
	  if (sum_mapped == 2 && count==2) {
	    count_reg++;
	if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
	  # ideally we'll get rid of printing out cigar string at some point
	  #print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1],qual[0],qual[1] > fname1;
	  print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
	  }
	else {
	  #print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0],qual[1],qual[0] > fname1;
print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
	  else count_norm++;
	  # or do we want to print the whole read group including chimeric?
	  #print c[read1] >> alignable;
	  #linenum++;
	  #print c[read2] >> alignable;
	  #linenum++;
	  prevlinenum=linenum;
	  for (j=1; j <= count; j++) {
	    print c[j] >> alignable;
	    linenum++;
	  }

	  if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
	    print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1]"$"fname"$"(prevlinenum+1),name[read2]"$"fname"$"linenum > norm;
	  }
	  else {
	for (i in c) {
	  print c[i] > fname3;
	}	
	count_unmapped++;
      }
    }
    else if (count == 1) {
      # this actually shouldn't happen, but it happens with alternate aligners on occasion
      count_abnorm++;
      for (i in c) {
	print c[i] > fname2;
	    print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2]"$"fname"$"(prevlinenum+1),name[read1]"$"fname"$"linenum > norm;
	  }
	}
      } # else (mapq above 0) 
    } # else (sum_mapped >= 2)
    # reset variables
    delete c;
    delete tmp;
    count=1;
    prev=a[1];
  }
@@ -310,15 +334,58 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
END{
    # deal with read pair group
    tottot++;
  if (count==3) {
    # chimeric read
    for (j=1; j <= 3; j++) {
    normal=0; # assume not normal, will get set if normal
    # first take care of unmapped reads
    # blacklist - if 3rd bit set (=4) it means unmapped
    sum_mapped = 0;
    for (j=1; j <= count; j++) {
        split(c[j],tmp);
	mapped[j] = and(tmp[2],4) == 0;
	sum_mapped = sum_mapped+mapped[j];
    }
    if (sum_mapped < 2) {
      # unmapped
      for (j=1; j <= count; j++) {
	print c[j] >> unmapped;
      }
      count_unmapped++;
    }
    else {
      # find minimum mapq; if it's 0, send to mapq0 file
      minmapq=100;
      for (j=1; j <= count; j++) {
        split(c[j],tmp);
      split(tmp[1],readname,"/");
      read[j] = readname[2];
	if (tmp[5] < minmapq) {
          minmapq=tmp[5];
	}
      }
      if (minmapq == 0) {
	  count_mapq0++;
        for (j=1; j <= count; j++) {
          print c[j] >> mapq0;
	}
      }
      else if ((count > 4) || (count == 1)) {
        # count==1 shouldn't happen; occurs with alternate aligners at times
	for (j=1; j <= count; j++) {
          print c[j] >> abnorm;
	}
      }
      else {
        # count is 2,3,4.
	# 2 will be normal paired 
	# 3 will be chimeric paired or abnormal
        # (chimeric pair, depends on where the ends are)
        # 4 will be chimeric paired or abnormal
        # (chimeric pair, depends on where the ends are) 
        sum_mapped=0;
	for (j=1; j <= count; j++) {
          split(c[j], tmp);
	  # first in pair: 64
	  read[j] = (and(tmp[2], 64) > 0);
	  name[j] = tmp[1];
	
      # strand
	  # strand; Bit 16 set means reverse strand
	  str[j] = and(tmp[2],16);
	  # chromosome
	  chr[j] = tmp[3];
@@ -330,7 +397,7 @@ END{
	  cigarstr[j] = tmp[6];
	  # sequence
	  seq[j] = tmp[10];
      #qual[j] = tmp[11];
	
	  # get rid of soft clipping to know correct position
	  if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
	    split(tmp[6], cigar, "S");
@@ -339,6 +406,7 @@ END{
	      pos[j] = 1;
	    }
	  }
	  # get rid of hard clipping to know correct position
	  else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
	    split(tmp[6], cigar, "H");
	    pos[j] = pos[j] - cigar[1];
@@ -364,7 +432,7 @@ END{
	      cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	      pos[j] = pos[j] + cigloc;
	    }
	if (tmp[6] ~ /[0-9]+H$/) {
	    else if (tmp[6] ~ /[0-9]+H$/) {
	      where = match(tmp[6],/[0-9]+H$/);
	      cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	      pos[j] = pos[j] + cigloc;
@@ -373,25 +441,67 @@ END{
	    if (chr[j] ~ /MT/ && pos[j] >= 16569) {
	      pos[j] = pos[j] - 16569;
	    }
	  } # else if str[j]==16
	} # end for loop, now have all info about each read end

	if (sum_mapped == 2) {
	  # take these as the reads
	  read1=0;
	  read2=0;
	  for (j=1; j <= count; j++) {
	    if (mapped[j] && !read1) read1=j;
	    else if (mapped[j] && read1) read2=j;
	  }
	  normal=1;
	}
	else if (sum_mapped == 4) {
	  # looking for A/B...A/B
	  # will always be first in pair, second in pair
	  if (read[1] == read[2] && read[3] == read[4] && read[1] != read[3]) {
	    dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
	    dist[24] = abs(chr[2]-chr[4])*10000000 + abs(pos[2]-pos[4]);
	    dist[14] = abs(chr[1]-chr[4])*10000000 + abs(pos[1]-pos[4]);
	    dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
		
	    # A/B, A/B
	    if ((dist[13] < 1000 && dist[24] < 1000) || (dist[14] < 1000 && dist[23] < 1000)) {
	      read1 = 1;
	      read2 = 2;
	      normal = 1;
	    }
	    else {
	      # abnormal
	      for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
	      }
      mapped[j] = and(tmp[2],4) == 0;
	      count_abnorm++;
	    }
    
	  } 
	  else { # count 4, not close together
	      # abnormal
	      for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
	      }
	      count_abnorm++;
	  }
	} # if sum_mapped == 4
	else { # sum_mapped == 3
          dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]);
	  dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
	  dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
    
	  if (min(dist[12],min(dist[23],dist[13])) < 1000) {
      # The paired ends look like A/B...B.  Make sure we take A and B.
	    # The paired ends look like A/B...B. Make sure we take A and B
	    if (read[1] == read[2]) {
	      # take the unique one "B" for sure
	      read2 = 3;
	      # take the end of "A/B" that isn't close to "B"
	      read1 = dist[13] > dist[23] ? 1:2;
	    }
      else if (read[1] == read[3]) {
	    else if (read[1] == read[3]) { # but this shouldn't happen
	      read2 = 2;
	      read1 = dist[12] > dist[23] ? 1:3;
	      print "This shouldn't happen",name[read2] > "neva_err"
	    }
	    else if (read[2] == read[3]) {
	      read2 = 1;
@@ -401,113 +511,42 @@ END{
	      printf("reads strange\n") > "/dev/stderr"
	      exit 1
	    }
      
      if (mapped[read1] && mapped[read2]) {
	count_norm++;
	if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
	  print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
	}
	else {
	  print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
	}
      }
      else {
	for (i in c) {
	  print c[i] > fname3;
	}	
	count_unmapped++;
      }
	    normal = 1;
	  }
	  else {
	    # chimeric read with the 3 ends > 1KB apart
	    count_abnorm++;
	    for (i in c) {
	print c[i] > fname2;
      }
    }
  }
  else if (count > 3) {
    # chimeric read > 3, too many to deal with
    count_abnorm++;
    for (i in c) {
      print c[i] > fname2;
    }
  }
  else if (count == 2) {
    # code here should be same as above, but it's a "normal" read
    j = 0;
    for (i in c) {
      split(c[i], tmp);
      split(tmp[1],readname,"/");
      str[j] = and(tmp[2],16);
      chr[j] = tmp[3];
      pos[j] = tmp[4];
      m[j] = tmp[5];
      cigarstr[j] = tmp[6];
      seq[j] = tmp[10];
      #qual[j] = tmp[11];
      name[j] = tmp[1];    
      
      if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
	split(tmp[6], cigar, "S");
	pos[j] = pos[j] - cigar[1];
	if (pos[j] <= 0) {
	  pos[j] = 1;
	}
      }
      else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
	split(tmp[6], cigar, "H");
	pos[j] = pos[j] - cigar[1];
	if (pos[j] <= 0) {
	  pos[j] = 1;
	}
      }
      else if (str[j] == 16) {
	# count Ms,Ds,Ns,Xs,=s for sequence length 
	seqlength=0; 
	currstr=tmp[6];
	where=match(currstr, /[0-9]+[M|D|N|X|=]/); 
	while (where>0) {
	  seqlength+=substr(currstr,where,RLENGTH-1)+0;
	  currstr=substr(currstr,where+RLENGTH);
	  where=match(currstr, /[0-9]+[M|D|N|X|=]/);
	}
	pos[j] = pos[j] + seqlength - 1;
	if (tmp[6] ~ /[0-9]+S$/) {
	  where = match(tmp[6],/[0-9]+S$/);
	  cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	  pos[j] = pos[j] + cigloc;
	      print c[i] > abnorm;
	    }
	if (tmp[6] ~ /[0-9]+H$/) {
	  where = match(tmp[6],/[0-9]+H$/);
	  cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
	  pos[j] = pos[j] + cigloc;
	  }
	if (chr[j] ~ /MT/ && pos[j] >= 16569) {
	  pos[j] = pos[j] - 16569;
	}
      }
      mapped[j] = and(tmp[2],4) == 0;
      j++;
    }
    if (mapped[0] && mapped[1]) {
	} # sum_mapped == 3
	if (normal) {
	  if (sum_mapped == 2 && count==2) {
	    count_reg++;
      if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
	# ideally we'll get rid of printing out cigar string at some point
	# qual
	print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
	  }
      else {
	print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
	  else count_norm++;
	  # or do we want to print the whole read group including chimeric?
	  #print c[read1] >> alignable;
	  #linenum++;
	  #print c[read2] >> alignable;
	  #linenum++;
	  prevlinenum=linenum;
	  for (j=1; j <= count; j++) {
	    print c[j] >> alignable;
	    linenum++;
	  }

	  if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
	    print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1]"$"fname"$"(prevlinenum+1),name[read2]"$"fname"$"linenum > norm;
	  }
	  else {
      for (i in c) {
	print c[i] > fname3;
      }	
      count_unmapped++;
	    print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2]"$"fname"$"(prevlinenum+1),name[read1]"$"fname"$"linenum > norm;
	  }
	}
  resfile=fname1".res.txt";
  printf("%d %d %d %d %d\n", tottot, count_unmapped, count_reg, count_norm, count_abnorm) >> resfile;
      } # else (mapq above 0) 
    } # else (sum_mapped >= 2)

    resfile=norm".res.txt";
    printf("%d %d %d %d %d %d\n", tottot, count_unmapped, count_reg, count_norm, count_abnorm, count_mapq0) >> resfile;
}
+2 −1
Original line number Diff line number Diff line
@@ -31,7 +31,8 @@ a3+=$4; # chimeric paired
a4+=$5; # chimeric ambiguous
a5+=$6; # unmapped
a6+=$1; # ligations 
a7+=$7; # mapq0
}
END{
    printf("Sequenced Read Pairs:  %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n Ligation Motif Present: %'d (%0.2f%)\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n", a1, a3, a3*100/a1, a4, a4*100/a1, a5, a5*100/a1, a2, a2*100/a1, a6, a6*100/a1, a3+a4, (a3+a4)*100/a1);
    printf("Sequenced Read Pairs:  %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n MAPQ 0: %'d (%0.2f%)\n Ligation Motif Present: %'d (%0.2f%)\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n", a1, a3, a3*100/a1, a4, a4*100/a1, a5, a5*100/a1, a2, a2*100/a1, a7, a7*100/a1, a6, a6*100/a1, a3+a4, (a3+a4)*100/a1);
}