mostly working version of 1.6 (f0317a2f) · Commits · Chaos / chaos.juicer

CPU/common/check.sh

+1 −1

Original line number	Diff line number	Diff line
		@@ -29,7 +29,7 @@

		# Start by checking the statistics to see if they add up
		res1=`cat ${splitdir}/normres*`
		check1=`cat ${splitdir}/normres* \| awk '{s2+=$2; s3+=$3; s4+=$4; s5+=$5; s6+=$6}END{if (s2 != s3+s4+s5+s6){print 0}else{print 1}}'`
		check1=`cat ${splitdir}/normres* \| awk '{s2+=$2; s3+=$3; s4+=$4; s5+=$5; s6+=$6; s7+=$7}END{if (s2 != s3+s4+s5+s6+s7){print 0}else{print 1}}'`
		if [ $check1 -eq 0 ] \|\| [ -z "$res1" ]
		then
		echo "***! Error! The statistics do not add up. Alignment likely failed to complete on one or more files. Run relaunch_prep.sh"

CPU/common/chimeric_blacklist.awk

+435 −396

Original line number	Diff line number	Diff line
		@@ -23,24 +23,24 @@
		# THE SOFTWARE.
		##########
		# Takes a SAM file, looks for chimeric reads and normal reads. Outputs
		# only those reads mapped to chromosomes 1-24 with MAPQ > 0.
		# only those reads mapped with MAPQ > 0
		#
		# Output to fname1 is of the form:
		# strand1 chromosome1 position1 frag1 strand2 chromosome2 position2 frag2
		# mapq1 cigar1 seq1 mapq2 cigar2 seq2
		# where chr1 is always <= chr2
		#
		# Output to fname2 retains SAM format.
		# SAMs are also outputted.
		#
		# Chimeric reads are treated as follows:
		# "Normal" chimeras (with 3 reads, two with the ligation junction), where the
		# third read (w/o ligation junction) is within 20Kbp of one of the ligation
		# junction reads, are sent to fname1. All other chimeras (where either there
		# junction reads, are sent to fname"_norm". All other chimeras (where either there
		# are more than 3 reads, or they don't contain the ligation junction, or the
		# one non-ligation junction end is far from the others, are sent to fname2.
		# one non-ligation junction end is far from the others, are sent to fname"_abnorm".
		#
		# awk -f chimeric_blacklist.awk -v fname1="norm_chimera" fname2="abnorm_chimera" fname3="unmapped"
		# Juicer version 1.5
		# awk -f chimeric_blacklist.awk -v fname="stem"
		# Juicer version 1.6

		# returns absolute value
		function abs(value)
		@@ -73,11 +73,35 @@ function less_than(s1,c1,p1,s2,c2,p2)
		BEGIN{
		OFS="\t";
		tottot = -1; # will count first non-group
		count_unmapped = -1; # will count first non-group
		norm=fname"_norm.txt"
		alignable=fname"_alignable.sam"
		abnorm=fname"_abnorm.sam"
		unmapped=fname"_unmapped.sam"
		mapq0=fname"_mapq0.sam"
		linenum=0
		}
		$0 ~ /^@/{
		# print SAM header to SAM files
		print > alignable;
		linenum++;
		print > abnorm;
		print > unmapped;
		print > mapq0;
		}
		$0 !~ /^@/{
		if (tottot == -1) {
		print "@PG", "ID:Juicer", "VN:1.6" > alignable;
		linenum++;
		print "@PG", "ID:Juicer", "VN:1.6" > abnorm;
		print "@PG", "ID:Juicer", "VN:1.6" > unmapped;
		print "@PG", "ID:Juicer", "VN:1.6" > mapq0;
		}
		{
		# input file is sorted by read name. Look at read name to group
		# appropriately
		# we don't need this with paired end but might as well keep it
		split($1,a,"/");

		if(a[1]==prev){
		# move on to next record. look below this block for actions that occur
		# regardless.
		@@ -86,12 +110,55 @@ BEGIN{
		else {
		# deal with read pair group
		tottot++;
		if (count==3) {
		# chimeric read
		for (j=1; j <= 3; j++) {
		normal=0; # assume not normal, will get set if normal
		# first take care of unmapped reads
		# blacklist - if 3rd bit set (=4) it means unmapped
		sum_mapped = 0;
		for (j=1; j <= count; j++) {
		split(c[j],tmp);
		mapped[j] = and(tmp[2],4) == 0;
		sum_mapped = sum_mapped+mapped[j];
		}
		if (sum_mapped < 2) {
		# unmapped
		for (j=1; j <= count; j++) {
		print c[j] >> unmapped;
		}
		# otherwise first time through will count extra unmapped read
		count_unmapped++;
		}
		else {
		# find minimum mapq; if it's 0, send to mapq0 file
		minmapq=100;
		for (j=1; j <= count; j++) {
		split(c[j],tmp);
		split(tmp[1],readname,"/");
		read[j] = readname[2];
		if (tmp[5] < minmapq) {
		minmapq=tmp[5];
		}
		}
		if (minmapq == 0) {
		count_mapq0++;
		for (j=1; j <= count; j++) {
		print c[j] >> mapq0;
		}
		}
		else if ((count > 4) \|\| (count == 1)) {
		# count==1 shouldn't happen; occurs with alternate aligners at times
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}
		}
		else {
		# count is 2,3,4.
		# 2 will be normal paired
		# 3 will be chimeric paired or abnormal
		# (chimeric pair, depends on where the ends are)
		# 4 will be chimeric paired or abnormal
		# (chimeric pair, depends on where the ends are)
		for (j=1; j <= count; j++) {
		split(c[j],tmp);
		# first in pair: 64
		read[j] = (and(tmp[2], 64) > 0);
		name[j] = tmp[1];

		# strand; Bit 16 set means reverse strand
		@@ -106,7 +173,6 @@ BEGIN{
		cigarstr[j] = tmp[6];
		# sequence
		seq[j] = tmp[10];
		#qual[j] = tmp[11];

		# get rid of soft clipping to know correct position
		if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
		@@ -151,26 +217,67 @@ BEGIN{
		if (chr[j] ~ /MT/ && pos[j] >= 16569) {
		pos[j] = pos[j] - 16569;
		}
		} # else if str[j]==16
		} # end for loop, now have all info about each read end

		if (sum_mapped == 2) {
		# take these as the reads
		read1=0;
		read2=0;
		for (j=1; j <= count; j++) {
		if (mapped[j] && !read1) read1=j;
		else if (mapped[j] && read1) read2=j;
		}
		normal=1;
		}
		else if (sum_mapped == 4) {
		# looking for A/B...A/B
		# will always be first in pair, second in pair
		if (read[1] == read[2] && read[3] == read[4] && read[1] != read[3]) {
		dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
		dist[24] = abs(chr[2]-chr[4])*10000000 + abs(pos[2]-pos[4]);
		dist[14] = abs(chr[1]-chr[4])*10000000 + abs(pos[1]-pos[4]);
		dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);

		# A/B, A/B
		if ((dist[13] < 1000 && dist[24] < 1000) \|\| (dist[14] < 1000 && dist[23] < 1000)) {
		read1 = 1;
		read2 = 2;
		normal = 1;
		}
		# blacklist - if 3rd bit set (=4) it means unmapped
		mapped[j] = and(tmp[2],4) == 0;
		else {
		# abnormal
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}

		count_abnorm++;
		}
		}
		else { # count 4, not close together
		# abnormal
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}
		count_abnorm++;
		}
		} # if sum_mapped == 4
		else { # sum_mapped == 3
		dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]);
		dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
		dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);

		if (min(dist[12],min(dist[23],dist[13])) < 1000) {
		# The paired ends look like A/B...B. Make sure we take A and B.
		# The paired ends look like A/B...B. Make sure we take A and B
		if (read[1] == read[2]) {
		# take the unique one "B" for sure
		read2 = 3;
		# take the end of "A/B" that isn't close to "B"
		read1 = dist[13] > dist[23] ? 1:2;
		}
		else if (read[1] == read[3]) {
		else if (read[1] == read[3]) { # but this shouldn't happen
		read2 = 2;
		read1 = dist[12] > dist[23] ? 1:3;
		print "This shouldn't happen",name[read2] > "neva_err"
		}
		else if (read[2] == read[3]) {
		read2 = 1;
		@@ -180,127 +287,44 @@ BEGIN{
		printf("reads strange\n") > "/dev/stderr"
		exit 1
		}

		if (mapped[read1] && mapped[read2]) {
		count_norm++;
		if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
		# print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2],qual[read1],qual[read2] > fname1;
		print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
		}
		else {
		# print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1],qual[read2],qual[read1] > fname1;
		print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
		}
		}
		else {
		for (i in c) {
		print c[i] > fname3;
		}
		count_unmapped++;
		}
		normal = 1;
		}
		else {
		# chimeric read with the 3 ends > 1KB apart
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		}
		}
		}
		else if (count > 3) {
		# chimeric read > 3, too many to deal with
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		}
		}
		else if (count == 2) {
		# code here should be same as above, but it's a "normal" read
		j = 0;
		for (i in c) {
		split(c[i], tmp);
		split(tmp[1],readname,"/");
		str[j] = and(tmp[2],16);
		chr[j] = tmp[3];
		pos[j] = tmp[4];
		m[j] = tmp[5];
		cigarstr[j] = tmp[6];
		seq[j] = tmp[10];
		#qual[j] = tmp[11];
		name[j] = tmp[1];

		# blacklist - if 3rd bit set (=4) it means unmapped
		mapped[j] = and(tmp[2],4) == 0;

		if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
		split(tmp[6], cigar, "S");
		pos[j] = pos[j] - cigar[1];
		if (pos[j] <= 0) {
		pos[j] = 1;
		}
		}
		else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
		split(tmp[6], cigar, "H");
		pos[j] = pos[j] - cigar[1];
		if (pos[j] <= 0) {
		pos[j] = 1;
		}
		}
		else if (str[j] == 16) {
		# count Ms,Ds,Ns,Xs,=s for sequence length
		seqlength=0;
		currstr=tmp[6];
		where=match(currstr, /[0-9]+[M\|D\|N\|X\|=]/);
		while (where>0) {
		seqlength+=substr(currstr,where,RLENGTH-1)+0;
		currstr=substr(currstr,where+RLENGTH);
		where=match(currstr, /[0-9]+[M\|D\|N\|X\|=]/);
		}
		pos[j] = pos[j] + seqlength - 1;
		if (tmp[6] ~ /[0-9]+S$/) {
		where = match(tmp[6],/[0-9]+S$/);
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		print c[i] > abnorm;
		}
		if (tmp[6] ~ /[0-9]+H$/) {
		where = match(tmp[6],/[0-9]+H$/);
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		}
		if (chr[j] ~ /MT/ && pos[j] >= 16569) {
		pos[j] = pos[j] - 16569;
		}
		}
		j++;
		}
		if (mapped[0] && mapped[1]) {
		} # sum_mapped == 3
		if (normal) {
		if (sum_mapped == 2 && count==2) {
		count_reg++;
		if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
		# ideally we'll get rid of printing out cigar string at some point
		#print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1],qual[0],qual[1] > fname1;
		print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
		}
		else {
		#print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0],qual[1],qual[0] > fname1;
		print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
		else count_norm++;
		# or do we want to print the whole read group including chimeric?
		#print c[read1] >> alignable;
		#linenum++;
		#print c[read2] >> alignable;
		#linenum++;
		prevlinenum=linenum;
		for (j=1; j <= count; j++) {
		print c[j] >> alignable;
		linenum++;
		}

		if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
		print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1]"$"fname"$"(prevlinenum+1),name[read2]"$"fname"$"linenum > norm;
		}
		else {
		for (i in c) {
		print c[i] > fname3;
		}
		count_unmapped++;
		}
		}
		else if (count == 1) {
		# this actually shouldn't happen, but it happens with alternate aligners on occasion
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2]"$"fname"$"(prevlinenum+1),name[read1]"$"fname"$"linenum > norm;
		}
		}
		} # else (mapq above 0)
		} # else (sum_mapped >= 2)
		# reset variables
		delete c;
		delete tmp;
		count=1;
		prev=a[1];
		}
		@@ -310,15 +334,58 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
		END{
		# deal with read pair group
		tottot++;
		if (count==3) {
		# chimeric read
		for (j=1; j <= 3; j++) {
		normal=0; # assume not normal, will get set if normal
		# first take care of unmapped reads
		# blacklist - if 3rd bit set (=4) it means unmapped
		sum_mapped = 0;
		for (j=1; j <= count; j++) {
		split(c[j],tmp);
		mapped[j] = and(tmp[2],4) == 0;
		sum_mapped = sum_mapped+mapped[j];
		}
		if (sum_mapped < 2) {
		# unmapped
		for (j=1; j <= count; j++) {
		print c[j] >> unmapped;
		}
		count_unmapped++;
		}
		else {
		# find minimum mapq; if it's 0, send to mapq0 file
		minmapq=100;
		for (j=1; j <= count; j++) {
		split(c[j],tmp);
		split(tmp[1],readname,"/");
		read[j] = readname[2];
		if (tmp[5] < minmapq) {
		minmapq=tmp[5];
		}
		}
		if (minmapq == 0) {
		count_mapq0++;
		for (j=1; j <= count; j++) {
		print c[j] >> mapq0;
		}
		}
		else if ((count > 4) \|\| (count == 1)) {
		# count==1 shouldn't happen; occurs with alternate aligners at times
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}
		}
		else {
		# count is 2,3,4.
		# 2 will be normal paired
		# 3 will be chimeric paired or abnormal
		# (chimeric pair, depends on where the ends are)
		# 4 will be chimeric paired or abnormal
		# (chimeric pair, depends on where the ends are)
		sum_mapped=0;
		for (j=1; j <= count; j++) {
		split(c[j], tmp);
		# first in pair: 64
		read[j] = (and(tmp[2], 64) > 0);
		name[j] = tmp[1];

		# strand
		# strand; Bit 16 set means reverse strand
		str[j] = and(tmp[2],16);
		# chromosome
		chr[j] = tmp[3];
		@@ -330,7 +397,7 @@ END{
		cigarstr[j] = tmp[6];
		# sequence
		seq[j] = tmp[10];
		#qual[j] = tmp[11];

		# get rid of soft clipping to know correct position
		if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
		split(tmp[6], cigar, "S");
		@@ -339,6 +406,7 @@ END{
		pos[j] = 1;
		}
		}
		# get rid of hard clipping to know correct position
		else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
		split(tmp[6], cigar, "H");
		pos[j] = pos[j] - cigar[1];
		@@ -364,7 +432,7 @@ END{
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		}
		if (tmp[6] ~ /[0-9]+H$/) {
		else if (tmp[6] ~ /[0-9]+H$/) {
		where = match(tmp[6],/[0-9]+H$/);
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		@@ -373,25 +441,67 @@ END{
		if (chr[j] ~ /MT/ && pos[j] >= 16569) {
		pos[j] = pos[j] - 16569;
		}
		} # else if str[j]==16
		} # end for loop, now have all info about each read end

		if (sum_mapped == 2) {
		# take these as the reads
		read1=0;
		read2=0;
		for (j=1; j <= count; j++) {
		if (mapped[j] && !read1) read1=j;
		else if (mapped[j] && read1) read2=j;
		}
		normal=1;
		}
		else if (sum_mapped == 4) {
		# looking for A/B...A/B
		# will always be first in pair, second in pair
		if (read[1] == read[2] && read[3] == read[4] && read[1] != read[3]) {
		dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);
		dist[24] = abs(chr[2]-chr[4])*10000000 + abs(pos[2]-pos[4]);
		dist[14] = abs(chr[1]-chr[4])*10000000 + abs(pos[1]-pos[4]);
		dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);

		# A/B, A/B
		if ((dist[13] < 1000 && dist[24] < 1000) \|\| (dist[14] < 1000 && dist[23] < 1000)) {
		read1 = 1;
		read2 = 2;
		normal = 1;
		}
		else {
		# abnormal
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}
		mapped[j] = and(tmp[2],4) == 0;
		count_abnorm++;
		}

		}
		else { # count 4, not close together
		# abnormal
		for (j=1; j <= count; j++) {
		print c[j] >> abnorm;
		}
		count_abnorm++;
		}
		} # if sum_mapped == 4
		else { # sum_mapped == 3
		dist[12] = abs(chr[1]-chr[2])*10000000 + abs(pos[1]-pos[2]);
		dist[23] = abs(chr[2]-chr[3])*10000000 + abs(pos[2]-pos[3]);
		dist[13] = abs(chr[1]-chr[3])*10000000 + abs(pos[1]-pos[3]);

		if (min(dist[12],min(dist[23],dist[13])) < 1000) {
		# The paired ends look like A/B...B. Make sure we take A and B.
		# The paired ends look like A/B...B. Make sure we take A and B
		if (read[1] == read[2]) {
		# take the unique one "B" for sure
		read2 = 3;
		# take the end of "A/B" that isn't close to "B"
		read1 = dist[13] > dist[23] ? 1:2;
		}
		else if (read[1] == read[3]) {
		else if (read[1] == read[3]) { # but this shouldn't happen
		read2 = 2;
		read1 = dist[12] > dist[23] ? 1:3;
		print "This shouldn't happen",name[read2] > "neva_err"
		}
		else if (read[2] == read[3]) {
		read2 = 1;
		@@ -401,113 +511,42 @@ END{
		printf("reads strange\n") > "/dev/stderr"
		exit 1
		}

		if (mapped[read1] && mapped[read2]) {
		count_norm++;
		if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
		print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
		}
		else {
		print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
		}
		}
		else {
		for (i in c) {
		print c[i] > fname3;
		}
		count_unmapped++;
		}
		normal = 1;
		}
		else {
		# chimeric read with the 3 ends > 1KB apart
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		}
		}
		}
		else if (count > 3) {
		# chimeric read > 3, too many to deal with
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		}
		}
		else if (count == 2) {
		# code here should be same as above, but it's a "normal" read
		j = 0;
		for (i in c) {
		split(c[i], tmp);
		split(tmp[1],readname,"/");
		str[j] = and(tmp[2],16);
		chr[j] = tmp[3];
		pos[j] = tmp[4];
		m[j] = tmp[5];
		cigarstr[j] = tmp[6];
		seq[j] = tmp[10];
		#qual[j] = tmp[11];
		name[j] = tmp[1];

		if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
		split(tmp[6], cigar, "S");
		pos[j] = pos[j] - cigar[1];
		if (pos[j] <= 0) {
		pos[j] = 1;
		}
		}
		else if (str[j] == 0 && tmp[6] ~/^[0-9]+H/) {
		split(tmp[6], cigar, "H");
		pos[j] = pos[j] - cigar[1];
		if (pos[j] <= 0) {
		pos[j] = 1;
		}
		}
		else if (str[j] == 16) {
		# count Ms,Ds,Ns,Xs,=s for sequence length
		seqlength=0;
		currstr=tmp[6];
		where=match(currstr, /[0-9]+[M\|D\|N\|X\|=]/);
		while (where>0) {
		seqlength+=substr(currstr,where,RLENGTH-1)+0;
		currstr=substr(currstr,where+RLENGTH);
		where=match(currstr, /[0-9]+[M\|D\|N\|X\|=]/);
		}
		pos[j] = pos[j] + seqlength - 1;
		if (tmp[6] ~ /[0-9]+S$/) {
		where = match(tmp[6],/[0-9]+S$/);
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		print c[i] > abnorm;
		}
		if (tmp[6] ~ /[0-9]+H$/) {
		where = match(tmp[6],/[0-9]+H$/);
		cigloc = substr(tmp[6],where,RLENGTH-1) + 0;
		pos[j] = pos[j] + cigloc;
		}
		if (chr[j] ~ /MT/ && pos[j] >= 16569) {
		pos[j] = pos[j] - 16569;
		}
		}
		mapped[j] = and(tmp[2],4) == 0;
		j++;
		}
		if (mapped[0] && mapped[1]) {
		} # sum_mapped == 3
		if (normal) {
		if (sum_mapped == 2 && count==2) {
		count_reg++;
		if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
		# ideally we'll get rid of printing out cigar string at some point
		# qual
		print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
		}
		else {
		print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
		else count_norm++;
		# or do we want to print the whole read group including chimeric?
		#print c[read1] >> alignable;
		#linenum++;
		#print c[read2] >> alignable;
		#linenum++;
		prevlinenum=linenum;
		for (j=1; j <= count; j++) {
		print c[j] >> alignable;
		linenum++;
		}

		if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
		print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1]"$"fname"$"(prevlinenum+1),name[read2]"$"fname"$"linenum > norm;
		}
		else {
		for (i in c) {
		print c[i] > fname3;
		}
		count_unmapped++;
		print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2]"$"fname"$"(prevlinenum+1),name[read1]"$"fname"$"linenum > norm;
		}
		}
		resfile=fname1".res.txt";
		printf("%d %d %d %d %d\n", tottot, count_unmapped, count_reg, count_norm, count_abnorm) >> resfile;
		} # else (mapq above 0)
		} # else (sum_mapped >= 2)

		resfile=norm".res.txt";
		printf("%d %d %d %d %d %d\n", tottot, count_unmapped, count_reg, count_norm, count_abnorm, count_mapq0) >> resfile;
		}

CPU/common/stats_sub.awk

+2 −1

Original line number	Diff line number	Diff line
		@@ -31,7 +31,8 @@ a3+=$4; # chimeric paired
		a4+=$5; # chimeric ambiguous
		a5+=$6; # unmapped
		a6+=$1; # ligations
		a7+=$7; # mapq0
		}
		END{
		printf("Sequenced Read Pairs: %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n Ligation Motif Present: %'d (%0.2f%)\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n", a1, a3, a3100/a1, a4, a4100/a1, a5, a5100/a1, a2, a2100/a1, a6, a6100/a1, a3+a4, (a3+a4)100/a1);
		printf("Sequenced Read Pairs: %'d\n Normal Paired: %'d (%0.2f%)\n Chimeric Paired: %'d (%0.2f%)\n Chimeric Ambiguous: %'d (%0.2f%)\n Unmapped: %'d (%0.2f%)\n MAPQ 0: %'d (%0.2f%)\n Ligation Motif Present: %'d (%0.2f%)\nAlignable (Normal+Chimeric Paired): %'d (%0.2f%)\n", a1, a3, a3100/a1, a4, a4100/a1, a5, a5100/a1, a2, a2100/a1, a7, a7100/a1, a6, a6100/a1, a3+a4, (a3+a4)*100/a1);
		}

Admin message