Commit 0ecc10e1 authored by nchernia's avatar nchernia
Browse files

Change log

Major:
- BWA now aligns in paired end mode. This requires BWA version 0.7.17 or higher; short read and short end mode
are now deprecated

Minor:
- The chimera handling script now includes the header and prints out tab-delimited, for better conversion to BAM; it also no longer looks for the /1, /2 but rather looks for the SAM flag
- We dedup collisions now
- An addition to the dups script makes it run faster and with less memory when there are a lot of duplicates
- Statistics updated in CPU to properly handle multiple ligations; also added scripts in CPU that were missing for mega
- Made the names correct in the stats_sub script
- Count ligations explicitly excludes the readname in the fastq file
parent cab7978b
Loading
Loading
Loading
Loading
+62 −5
Original line number Diff line number Diff line
@@ -74,7 +74,14 @@ BEGIN{
  OFS="\t";
  tottot = -1; # will count first non-group
}
{
$0 ~ /^@/{
  # print SAM header to SAM files
     header = header""$0"\n";
}
$0 !~ /^@/{
  if (tottot == -1) {
     header = header"@PG\tID:Juicer\tVN:1.6";
  }
  # input file is sorted by read name.  Look at read name to group 
  # appropriately
  split($1,a,"/");
@@ -91,7 +98,15 @@ BEGIN{
      for (j=1; j <= count; j++) {
	split(c[j], tmp);
	split(tmp[1],readname,"/");
	# backwards compatibility
	# NB: we only care if the read ends match, not the exact number
	if (length(readname)>1) {
	    read[j] = readname[2];
	}
	else {
	    # first in pair: 64
	    read[j] = (and(tmp[2], 64) > 0);
	}
	name[j] = tmp[1];
	
	# strand; Bit 16 set means reverse strand
@@ -185,6 +200,9 @@ BEGIN{
	    }
	  }
	  else {
	    if (count_unmapped == -1) {
	      print header > fname3;
	    } 
	    for (i in c) {
	      print c[i] > fname3;
	    }	
@@ -193,10 +211,13 @@ BEGIN{
	}	
	else { 
	  # chimeric read with the 4 ends > 1KB apart
	  count_abnorm++;
	  if (count_abnorm == -1) {
	    print header > fname2;
	  }
	  for (i in c) {
	    print c[i] > fname2;
	  }
	  count_abnorm++;
	}
      }
      else {
@@ -237,6 +258,9 @@ BEGIN{
	    }
	  }
	  else {
	    if (count_unmapped == -1) {
	      print header > fname3;
	    }
	    for (i in c) {
	      print c[i] > fname3;
	    }	
@@ -245,19 +269,25 @@ BEGIN{
	}
	else {
	  # chimeric read with the 3 ends > 1KB apart
	  count_abnorm++;
	  if (count_abnorm == -1) {
	    print header > fname2;
	  }
	  for (i in c) {
	    print c[i] > fname2;
	  }
	  count_abnorm++;
	}
      }
    }
    else if (count > 3) {
      # chimeric read > 3, too many to deal with
      count_abnorm++;
      if (count_abnorm == -1) {
	print header > fname2;
      }
      for (i in c) {
	print c[i] > fname2;
      }
      count_abnorm++;
    }
    else if (count == 2) {
      # code here should be same as above, but it's a "normal" read
@@ -328,6 +358,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
	}
      }
      else {
	if (count_unmapped == -1) {
          print header > fname3;
	}
	for (i in c) {
	  print c[i] > fname3;
	}	
@@ -336,6 +369,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
    }
    else if (count == 1) {
      # this actually shouldn't happen, but it happens with alternate aligners on occasion
      if (count_abnorm == -1) {
	print header > fname2;
      }
      count_abnorm++;
      for (i in c) {
	print c[i] > fname2;
@@ -451,6 +487,9 @@ END{
	    }
	  }
	  else {
	    if (count_unmapped == -1) {
	      print header > fname3;
	    }
	    for (i in c) {
	      print c[i] > fname3;
	    }	
@@ -459,6 +498,9 @@ END{
	}	
	else { 
	  # chimeric read with the 4 ends > 1KB apart
          if (count_abnorm == -1) {
	    print header > fname2;
          }
	  count_abnorm++;
	  for (i in c) {
	    print c[i] > fname2;
@@ -503,6 +545,9 @@ END{
	    }
	  }
	  else {
	    if (count_unmapped == -1) {
	      print header > fname3;
	    }
	    for (i in c) {
	      print c[i] > fname3;
	    }	
@@ -511,6 +556,9 @@ END{
	}
	else {
	  # chimeric read with the 3 ends > 1KB apart
          if (count_abnorm == -1) {
	    print header > fname2;
          }
	  count_abnorm++;
	  for (i in c) {
	    print c[i] > fname2;
@@ -520,6 +568,9 @@ END{
    }
    else if (count > 3) {
      # chimeric read > 3, too many to deal with
      if (count_abnorm == -1) {
	print header > fname2;
      }
      count_abnorm++;
      for (i in c) {
	print c[i] > fname2;
@@ -594,6 +645,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
	}
      }
      else {
	if (count_unmapped == -1) {
          print header > fname3;
	}
	for (i in c) {
	  print c[i] > fname3;
	}	
@@ -602,6 +656,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
    }
    else if (count == 1) {
      # this actually shouldn't happen, but it happens with alternate aligners on occasion
      if (count_abnorm == -1) {
	print header > fname2;
      }
      count_abnorm++;
      for (i in c) {
	print c[i] > fname2;
+8 −4
Original line number Diff line number Diff line
@@ -31,10 +31,14 @@ if [ $total -eq $total2 ]
then 
    rm aligned/merged_sort.txt 
    rm -r splits 
    testname=$(ls -l fastq | awk 'NR==1{print $9}')
    if [ "${testname: -5}" == ".fastq" ]
    then
	for i in fastq/*.fastq
	do
            gzip $i
	done
    fi
    gzip aligned/merged_nodups.txt
    gzip aligned/dups.txt
    gzip aligned/opt_dups.txt
+42 −0
Original line number Diff line number Diff line
# Rearrange columns
# Two-pass algorithm; could have made it one pass but the behavior would
# have been confusing
# First find unique chromosomes represented in file and give them an index 
# number
# Then multiple chromosome index by 1B and add it to position; sort based
# on this number
BEGIN {
  while ((getline < fname) > 0){
    for (i=3; i<=NF; i+=7){
      chrs[$i]
    }
  }
  ind=1;
  for (i in chrs) {
    a[ind++]=i;
  }
  delete chrs;
  asort(a,b,"@val_num_asc");
  for (i=1; i<=length(b); i++) {
    chrs[b[i]]=i;
  }
  delete a;
  delete b;
}
{
  str=""; 
  for (i=3; i<=NF; i+=7){
    # chr index acts as a hash; same chromosome sorted by position;
    # different chromosome sorted by chromosome index
    num=chrs[$i]*1000000000+$(i+1); 
    str=str" "num;
  }
  split(str,a);
  n=asorti(a,b,"@val_num_asc");  
  # here is where we rearrange the columns according to sort position
  for (j=1; j<=n; j++){ 
    start=((b[j]-1)*7 + 1); 
    printf("%s %s %s %s %s %s %s ",$start,$(start+1),$(start+2),$(start+3),$(start+4),$(start+5),$(start+6));
  } 
  printf("\n");
}
+123 −0
Original line number Diff line number Diff line
# Returns absolute value of v
function abs(v) {
  return v<0?-v:v;
}
# Examines locs to see if they are within wobble
# locs should be pairs of positions to examine
# If all are within wobble limit, they are "tooclose" and are dups
function tooclose(locs) {
  cls=1;
  # abs function checks that distance is within wobble
  # AND function will require all distances to be within wobble
  for (i=1; i<length(locs); i+=2) {
    cls=and(cls,(abs(locs[i]-locs[i+1]) <= wobble));
  }
  return cls;
}
BEGIN{
  size=0;
  wobble=4;
  dupname=name"collisions_dups.txt";
  nodupname=name"collisions_nodups.txt";
  prevNF=0;
}
NF != 0{
  if (NF != prevNF) {
    # not a duplicate; handle dups array
    handledupsarray=1;
  }
  else {
    split(prev, p);
    test=1;
    # this code is testing that strand and chromosomes match 
    # NEVA: make sure we are sorting by strand as well .... prob. in GAWK as well as sort
    for (i=2; i<=NF; i=i+7) {
      test=and(test, ($i == p[i]));
      test=and(test, ($(i+1) == p[i+1]));
    }
    # this looks at the first position being within wobble
    test=and(test, (abs($4-p[4])<= wobble));
    handledupsarray=!test;
  }

  if (!handledupsarray) {
    line[size]=$0;
    size++;
  }
  else {
    if (size >= 2) {
      for (j=0; j<size; j++) {
	split(line[j], check1);
	if (!(j in dups)) {
	  for (k=j+1; k<size; k++) {
	    split(line[k], check2);
	    ind=1;
	    # set up locs array to check distance
	    for (x=4; x<=length(check1); x=x+7) {
	      locs[ind]=check1[x];
	      locs[ind+1]=check2[x];
	      ind=ind+2;
	    }
	    if (tooclose(locs)) {
	      dups[k]++; #places a 1 at dups[k]
	    }
	    delete locs;
	  }
	}
      }
      # print dups out to dup file, non-dups out to non-dup file
      for (j=0; j<size; j++) {
	if (j in dups) {
	  print line[j] > dupname
	}
	else {
	  print line[j] > nodupname
	}
      }
    }
    else { # size=1
      print line[0] > nodupname;
    }
    delete line;
    delete dups;
    size=1;
    line[0]=$0;
  }
  prev=$0;
  prevNF=NF;
}
END {
  if (size >= 2) {
    for (j=0; j<size; j++) {
      split(line[j], check1);
      if (!(j in dups)) {
	for (k=j+1; k<size; k++) {
	  split(line[k], check2);
	  ind=1;
	  # set up locs array to check distance
	  for (x=4; x<=length(check1); x=x+7) {
	    locs[ind]=check1[x];
	    locs[ind+1]=check2[x];
	    ind=ind+2;
	  }
	  if (tooclose(locs)) {
	    dups[k]++; #places a 1 at dups[k]
	  }
	  delete locs;
	}
      }
    }
    # print dups out to dup file, non-dups out to non-dup file
    for (j=0; j<size; j++) {
      if (j in dups) {
	print line[j] > dupname
      }
      else {
	print line[j] > nodupname
      }
    }
  }
  else { # size=1
    print line[0] > nodupname;
  }
}
 No newline at end of file
+4 −2
Original line number Diff line number Diff line
@@ -25,12 +25,14 @@
#
# Small helper script to count reads with ligation junction
# Juicer version 1.5
export LC_ALL=C
export LC_COLLATE=C
if [ "$usegzip" -eq 1 ]
then 
    num1=$(paste <(gunzip -c $name1$ext) <(gunzip -c $name2$ext) | grep -cE $ligation)
    num1=$(paste <(gunzip -c $name1$ext) <(gunzip -c $name2$ext) | awk '!((NR+2)%4)' | grep -cE $ligation)
    num2=$(gunzip -c ${name1}${ext} | wc -l | awk '{print $1}')
else
    num1=$(paste $name1$ext $name2$ext | grep -cE $ligation)
    num1=$(paste $name1$ext $name2$ext | awk '!((NR+2)%4)' | grep -cE $ligation)
    num2=$(wc -l ${name1}${ext} | awk '{print $1}')
fi
echo -ne "$num1 " > ${name}${ext}_norm.txt.res.txt
Loading