Change log (0ecc10e1) · Commits · Chaos / chaos.juicer

CPU/common/chimeric_blacklist.awk

+62 −5

Original line number	Diff line number	Diff line
		@@ -74,7 +74,14 @@ BEGIN{
		OFS="\t";
		tottot = -1; # will count first non-group
		}
		{
		$0 ~ /^@/{
		# print SAM header to SAM files
		header = header""$0"\n";
		}
		$0 !~ /^@/{
		if (tottot == -1) {
		header = header"@PG\tID:Juicer\tVN:1.6";
		}
		# input file is sorted by read name. Look at read name to group
		# appropriately
		split($1,a,"/");
		@@ -91,7 +98,15 @@ BEGIN{
		for (j=1; j <= count; j++) {
		split(c[j], tmp);
		split(tmp[1],readname,"/");
		# backwards compatibility
		# NB: we only care if the read ends match, not the exact number
		if (length(readname)>1) {
		read[j] = readname[2];
		}
		else {
		# first in pair: 64
		read[j] = (and(tmp[2], 64) > 0);
		}
		name[j] = tmp[1];

		# strand; Bit 16 set means reverse strand
		@@ -185,6 +200,9 @@ BEGIN{
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -193,10 +211,13 @@ BEGIN{
		}
		else {
		# chimeric read with the 4 ends > 1KB apart
		count_abnorm++;
		if (count_abnorm == -1) {
		print header > fname2;
		}
		for (i in c) {
		print c[i] > fname2;
		}
		count_abnorm++;
		}
		}
		else {
		@@ -237,6 +258,9 @@ BEGIN{
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -245,19 +269,25 @@ BEGIN{
		}
		else {
		# chimeric read with the 3 ends > 1KB apart
		count_abnorm++;
		if (count_abnorm == -1) {
		print header > fname2;
		}
		for (i in c) {
		print c[i] > fname2;
		}
		count_abnorm++;
		}
		}
		}
		else if (count > 3) {
		# chimeric read > 3, too many to deal with
		count_abnorm++;
		if (count_abnorm == -1) {
		print header > fname2;
		}
		for (i in c) {
		print c[i] > fname2;
		}
		count_abnorm++;
		}
		else if (count == 2) {
		# code here should be same as above, but it's a "normal" read
		@@ -328,6 +358,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -336,6 +369,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
		}
		else if (count == 1) {
		# this actually shouldn't happen, but it happens with alternate aligners on occasion
		if (count_abnorm == -1) {
		print header > fname2;
		}
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		@@ -451,6 +487,9 @@ END{
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -459,6 +498,9 @@ END{
		}
		else {
		# chimeric read with the 4 ends > 1KB apart
		if (count_abnorm == -1) {
		print header > fname2;
		}
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		@@ -503,6 +545,9 @@ END{
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -511,6 +556,9 @@ END{
		}
		else {
		# chimeric read with the 3 ends > 1KB apart
		if (count_abnorm == -1) {
		print header > fname2;
		}
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		@@ -520,6 +568,9 @@ END{
		}
		else if (count > 3) {
		# chimeric read > 3, too many to deal with
		if (count_abnorm == -1) {
		print header > fname2;
		}
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;
		@@ -594,6 +645,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
		}
		}
		else {
		if (count_unmapped == -1) {
		print header > fname3;
		}
		for (i in c) {
		print c[i] > fname3;
		}
		@@ -602,6 +656,9 @@ print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cig
		}
		else if (count == 1) {
		# this actually shouldn't happen, but it happens with alternate aligners on occasion
		if (count_abnorm == -1) {
		print header > fname2;
		}
		count_abnorm++;
		for (i in c) {
		print c[i] > fname2;

CPU/common/cleanup.sh

+8 −4

Original line number	Diff line number	Diff line
		@@ -31,10 +31,14 @@ if [ $total -eq $total2 ]
		then
		rm aligned/merged_sort.txt
		rm -r splits
		testname=$(ls -l fastq \| awk 'NR==1{print $9}')
		if [ "${testname: -5}" == ".fastq" ]
		then
		for i in fastq/*.fastq
		do
		gzip $i
		done
		fi
		gzip aligned/merged_nodups.txt
		gzip aligned/dups.txt
		gzip aligned/opt_dups.txt

CPU/common/collisions_dedup_rearrange_cols.awk

0 → 100755

+42 −0

Original line number	Diff line number	Diff line
		# Rearrange columns
		# Two-pass algorithm; could have made it one pass but the behavior would
		# have been confusing
		# First find unique chromosomes represented in file and give them an index
		# number
		# Then multiple chromosome index by 1B and add it to position; sort based
		# on this number
		BEGIN {
		while ((getline < fname) > 0){
		for (i=3; i<=NF; i+=7){
		chrs[$i]
		}
		}
		ind=1;
		for (i in chrs) {
		a[ind++]=i;
		}
		delete chrs;
		asort(a,b,"@val_num_asc");
		for (i=1; i<=length(b); i++) {
		chrs[b[i]]=i;
		}
		delete a;
		delete b;
		}
		{
		str="";
		for (i=3; i<=NF; i+=7){
		# chr index acts as a hash; same chromosome sorted by position;
		# different chromosome sorted by chromosome index
		num=chrs[$i]*1000000000+$(i+1);
		str=str" "num;
		}
		split(str,a);
		n=asorti(a,b,"@val_num_asc");
		# here is where we rearrange the columns according to sort position
		for (j=1; j<=n; j++){
		start=((b[j]-1)*7 + 1);
		printf("%s %s %s %s %s %s %s ",$start,$(start+1),$(start+2),$(start+3),$(start+4),$(start+5),$(start+6));
		}
		printf("\n");
		}

CPU/common/collisions_dups.awk

0 → 100755

+123 −0

Original line number	Diff line number	Diff line
		# Returns absolute value of v
		function abs(v) {
		return v<0?-v:v;
		}
		# Examines locs to see if they are within wobble
		# locs should be pairs of positions to examine
		# If all are within wobble limit, they are "tooclose" and are dups
		function tooclose(locs) {
		cls=1;
		# abs function checks that distance is within wobble
		# AND function will require all distances to be within wobble
		for (i=1; i<length(locs); i+=2) {
		cls=and(cls,(abs(locs[i]-locs[i+1]) <= wobble));
		}
		return cls;
		}
		BEGIN{
		size=0;
		wobble=4;
		dupname=name"collisions_dups.txt";
		nodupname=name"collisions_nodups.txt";
		prevNF=0;
		}
		NF != 0{
		if (NF != prevNF) {
		# not a duplicate; handle dups array
		handledupsarray=1;
		}
		else {
		split(prev, p);
		test=1;
		# this code is testing that strand and chromosomes match
		# NEVA: make sure we are sorting by strand as well .... prob. in GAWK as well as sort
		for (i=2; i<=NF; i=i+7) {
		test=and(test, ($i == p[i]));
		test=and(test, ($(i+1) == p[i+1]));
		}
		# this looks at the first position being within wobble
		test=and(test, (abs($4-p[4])<= wobble));
		handledupsarray=!test;
		}

		if (!handledupsarray) {
		line[size]=$0;
		size++;
		}
		else {
		if (size >= 2) {
		for (j=0; j<size; j++) {
		split(line[j], check1);
		if (!(j in dups)) {
		for (k=j+1; k<size; k++) {
		split(line[k], check2);
		ind=1;
		# set up locs array to check distance
		for (x=4; x<=length(check1); x=x+7) {
		locs[ind]=check1[x];
		locs[ind+1]=check2[x];
		ind=ind+2;
		}
		if (tooclose(locs)) {
		dups[k]++; #places a 1 at dups[k]
		}
		delete locs;
		}
		}
		}
		# print dups out to dup file, non-dups out to non-dup file
		for (j=0; j<size; j++) {
		if (j in dups) {
		print line[j] > dupname
		}
		else {
		print line[j] > nodupname
		}
		}
		}
		else { # size=1
		print line[0] > nodupname;
		}
		delete line;
		delete dups;
		size=1;
		line[0]=$0;
		}
		prev=$0;
		prevNF=NF;
		}
		END {
		if (size >= 2) {
		for (j=0; j<size; j++) {
		split(line[j], check1);
		if (!(j in dups)) {
		for (k=j+1; k<size; k++) {
		split(line[k], check2);
		ind=1;
		# set up locs array to check distance
		for (x=4; x<=length(check1); x=x+7) {
		locs[ind]=check1[x];
		locs[ind+1]=check2[x];
		ind=ind+2;
		}
		if (tooclose(locs)) {
		dups[k]++; #places a 1 at dups[k]
		}
		delete locs;
		}
		}
		}
		# print dups out to dup file, non-dups out to non-dup file
		for (j=0; j<size; j++) {
		if (j in dups) {
		print line[j] > dupname
		}
		else {
		print line[j] > nodupname
		}
		}
		}
		else { # size=1
		print line[0] > nodupname;
		}
		}
		No newline at end of file

CPU/common/countligations.sh

+4 −2

Original line number	Diff line number	Diff line
		@@ -25,12 +25,14 @@
		#
		# Small helper script to count reads with ligation junction
		# Juicer version 1.5
		export LC_ALL=C
		export LC_COLLATE=C
		if [ "$usegzip" -eq 1 ]
		then
		num1=$(paste <(gunzip -c $name1$ext) <(gunzip -c $name2$ext) \| grep -cE $ligation)
		num1=$(paste <(gunzip -c $name1$ext) <(gunzip -c $name2$ext) \| awk '!((NR+2)%4)' \| grep -cE $ligation)
		num2=$(gunzip -c ${name1}${ext} \| wc -l \| awk '{print $1}')
		else
		num1=$(paste $name1$ext $name2$ext \| grep -cE $ligation)
		num1=$(paste $name1$ext $name2$ext \| awk '!((NR+2)%4)' \| grep -cE $ligation)
		num2=$(wc -l ${name1}${ext} \| awk '{print $1}')
		fi
		echo -ne "$num1 " > ${name}${ext}_norm.txt.res.txt

Admin message