Merge pull request #256 from aidenlab/encode (b706453b) · Commits · Chaos / chaos.juicer

AWS/scripts/dups.awk

+17 −2

Original line number	Diff line number	Diff line
		@@ -50,8 +50,14 @@ function optcheck(tile1,tile2,x1,x2,y1,y2) {
		# Executed once, before beginning the file read
		BEGIN {
		i=0;
		if (length(nowobble)==0) {
		wobble1=4;
		wobble2=4;
		}
		else {
		wobble1=0;
		wobble2=0;
		}
		# names of output files
		# the variable "name" can be set via the -v flag
		dupname=name"dups.txt";
		@@ -95,6 +101,10 @@ $1 != p1 \|\| $2 != p2 \|\| $4 != p4 \|\| $5 != p5 \|\| $6 != p6 \|\| $8 != p8 \|\| abs($3-p
		}
		else dups[k]++; #places a 1 at dups[k]
		}
		if (abs(pos1[j]-pos1[k])>wobble1) {
		#print "test", pos1[j], pos1[k]
		break
		}
		}
		}
		}
		@@ -173,6 +183,11 @@ END {
		dups[k]++;
		}
		}
		if (abs(pos1[j]-pos1[k])>wobble1) {
		#print "test", pos1[j], pos1[k]
		break

		}
		}
		}
		}

AWS/scripts/juicer.sh

+15 −8

Original line number	Diff line number	Diff line
		@@ -100,11 +100,12 @@ genomeID="hg19"
		shortreadend=0
		# description, default empty
		about=""
		nofrag=0

		nofrag=1
		# use wobble for dedupping by default (not just exact matches)
		justexact=0

		## Read arguments
		usageHelp="Usage: ${0##*/} [-g genomeID] [-d topDir] [-q queue] [-l long queue] [-s site]\n [-a about] [-R end] [-S stage] [-p chrom.sizes path]\n [-y restriction site file] [-z reference genome file]\n [-C chunk size] [-D Juicer scripts directory]\n [-Q queue time limit] [-L long queue time limit] [-b ligation] [-t threads]\n [-r] [-h] [-x]"
		usageHelp="Usage: ${0##*/} [-g genomeID] [-d topDir] [-q queue] [-l long queue] [-s site]\n [-a about] [-R end] [-S stage] [-p chrom.sizes path]\n [-y restriction site file] [-z reference genome file]\n [-C chunk size] [-D Juicer scripts directory]\n [-Q queue time limit] [-L long queue time limit] [-b ligation] [-t threads]\n [-r] [-h] [-f] [-j]"
		genomeHelp="* [genomeID] must be defined in the script, e.g. \"hg19\" or \"mm10\" (default \n \"$genomeID\"); alternatively, it can be defined using the -z command"
		dirHelp="* [topDir] is the top level directory (default\n \"$topDir\")\n [topDir]/fastq must contain the fastq files\n [topDir]/splits will be created to contain the temporary split files\n [topDir]/aligned will be created for the final alignment"
		queueHelp="* [queue] is the queue for running alignments (default \"$queue\")"
		@@ -113,7 +114,7 @@ siteHelp="* [site] must be defined in the script, e.g. \"HindIII\" or \"MboI\"
		aboutHelp="* [about]: enter description of experiment, enclosed in single quotes"
		shortHelp="* -r: use the short read version of the aligner, bwa aln\n (default: long read, bwa mem)"
		shortHelp2="* [end]: use the short read aligner on read end, must be one of 1 or 2 "
		stageHelp="* [stage]: must be one of \"merge\", \"dedup\", \"final\", \"postproc\", or \"early\".\n -Use \"merge\" when alignment has finished but the merged_sort file has not\n yet been created.\n -Use \"dedup\" when the files have been merged into merged_sort but\n merged_nodups has not yet been created.\n -Use \"final\" when the reads have been deduped into merged_nodups but the\n final stats and hic files have not yet been created.\n -Use \"postproc\" when the hic files have been created and only\n postprocessing feature annotation remains to be completed.\n -Use \"early\" for an early exit, before the final creation of the stats and\n hic files"
		stageHelp="* [stage]: must be one of \"merge\", \"dedup\", \"final\", \"postproc\", or \"early\".\n -Use \"merge\" when alignment has finished but the merged_sort file has not\n yet been created.\n -Use \"dedup\" when the files have been merged into merged_sort but\n merged_nodups has not yet been created.\n -Use \"final\" when the reads have been deduped into merged_nodups but the\n final stats and hic files have not yet been created.\n -Use \"postproc\" when the hic files have been created and only\n postprocessing feature annotation remains to be completed.\n -Use \"early\" for an early exit, before the final creation of the stats and\n hic files. Can also use -e flag to exit early"
		pathHelp="* [chrom.sizes path]: enter path for chrom.sizes file"
		siteFileHelp="* [restriction site file]: enter path for restriction site file (locations of\n restriction sites in genome; can be generated with the script\n misc/generate_site_positions.py)"
		chunkHelp="* [chunk size]: number of lines in split files, must be multiple of 4\n (default ${splitsize}, which equals $(awk -v ss=${splitsize} 'BEGIN{print ss/4000000}') million reads)"
		@@ -123,7 +124,9 @@ queueTimeHelp="* [queue time limit]: time limit for queue, i.e. -W 12:00 is 12 h
		longQueueTimeHelp="* [long queue time limit]: time limit for long queue, i.e. -W 168:00 is one week\n (default ${long_queue_time})"
		ligationHelp="* [ligation junction]: use this string when counting ligation junctions"
		threadsHelp="* [threads]: number of threads when running BWA alignment"
		excludeHelp="* -x: exclude fragment-delimited maps from hic file creation"
		justHelp="* -j: just exact duplicates excluded at dedupping step"
		excludeHelp="* -f: include fragment-delimited maps in hic file creation"
		earlyexitHelp="* -e: Use for an early exit, before the final creation of the hic files"
		helpHelp="* -h: print this help and exit"

		printHelpAndExit() {
		@@ -146,12 +149,14 @@ printHelpAndExit() {
		echo -e "$longQueueTimeHelp"
		echo -e "$ligationHelp"
		echo -e "$threadsHelp"
		echo -e "$justHelp"
		echo -e "$earlyexitHelp"
		echo "$excludeHelp"
		echo "$helpHelp"
		exit "$1"
		}

		while getopts "d:g:R:a:hrq:s:p:l:y:z:S:C:D:Q:L:b:t:x" opt; do
		while getopts "d:g:R:a:hrq:s:p:l:y:z:S:C:D:Q:L:b:t:fje" opt; do
		case $opt in
		g) genomeID=$OPTARG ;;
		h) printHelpAndExit 0;;
		@@ -170,9 +175,11 @@ while getopts "d:g:R:a:hrq:s:p:l:y:z:S:C:D:Q:L:b:t:x" opt; do
		D) juiceDir=$OPTARG ;;
		Q) queue_time=$OPTARG ;;
		L) long_queue_time=$OPTARG ;;
		x) nofrag=1 ;;
		f) nofrag=0 ;;
		b) ligation=$OPTARG ;;
		t) threads=$OPTARG ;;
		j) justexact=1 ;;
		e) earlyexit=1 ;;
		[?]) printHelpAndExit 1;;
		esac
		done
		@@ -821,7 +828,7 @@ then
		${waitstring}
		#BSUB -J "${groupname}_osplit"
		bkill -J ${groupname}_clean1
		awk -v queue=$long_queue -v outfile=${debugdir}/dedup-${groupname}.out -v juicedir=${juiceDir} -v dir=$outputdir -v queuetime=$long_queue_time -v groupname=$groupname -f ${juiceDir}/scripts/split_rmdups.awk $outputdir/merged_sort.txt
		awk -v queue=$long_queue -v outfile=${debugdir}/dedup-${groupname}.out -v juicedir=${juiceDir} -v dir=$outputdir -v queuetime=$long_queue_time -v groupname=$groupname -v justexact=$justexact -f ${juiceDir}/scripts/split_rmdups.awk $outputdir/merged_sort.txt
		KILLCLNUP

		# if it dies, cleanup and write to relaunch script

AWS/scripts/split_rmdups.awk

+42 −34

Original line number	Diff line number	Diff line
		@@ -26,7 +26,6 @@
		# Dedup script that submits deduping jobs after splitting at known
		# non-duplicates
		# Juicer version 1.5

		BEGIN{
		tot=0;
		name=0;
		@@ -36,7 +35,13 @@ BEGIN{
		if (tot >= 1000000) {
		if (p1 != $1 \|\| p2 != $2 \|\| p4 != $4 \|\| p5 != $5 \|\| p8 != $8) {
		sname=sprintf("%s_msplit%04d_", groupname, name);
		if (justexact) {
		sysstring = sprintf("bsub -W %s -o %s -q %s -J %s <<- EOF\nawk -v nowobble=1 -f %s/scripts/dups.awk -v name=%s/%s %s/split%04d;\nEOF\n", queuetime, outfile, queue, sname, juicedir, dir, sname, dir, name, dir, name);
		}
		else {
		sysstring = sprintf("bsub -W %s -o %s -q %s -J %s <<- EOF\nawk -f %s/scripts/dups.awk -v name=%s/%s %s/split%04d;\nEOF\n", queuetime, outfile, queue, sname, juicedir, dir, sname, dir, name, dir, name);
		}

		system(sysstring);
		if (name==0) {
		waitstring=sprintf("done(%s)", sname);
		@@ -55,7 +60,12 @@ BEGIN{
		}
		END {
		sname=sprintf("%s_msplit%04d_", groupname, name);
		if (justexact) {
		sysstring = sprintf("bsub -W %s -o %s -q %s -J %s <<-EOF\nawk -v nowobble=1 -f %s/scripts/dups.awk -v name=%s/%s %s/split%04d;\nEOF\n", queuetime, outfile, queue, sname, juicedir, dir, sname, dir, name, dir, name);
		}
		else {
		sysstring = sprintf("bsub -W %s -o %s -q %s -J %s <<-EOF\nawk -f %s/scripts/dups.awk -v name=%s/%s %s/split%04d;\nEOF\n", queuetime, outfile, queue, sname, juicedir, dir, sname, dir, name, dir, name);
		}
		system(sysstring);
		if (name==0) {
		waitstring=sprintf("done(%s)", sname);
		@@ -67,6 +77,4 @@ END {
		system(sysstring);
		sysstring = sprintf("bsub -W %s -o %s -q %s -J %s_rmsplit -w \"done(%s_catsplit)\" <<- EOF\n rm %s/_msplit_optdups.txt; rm %s/_msplit_dups.txt; rm %s/_msplit_merged_nodups.txt; rm %s/split*;\nEOF",queuetime, outfile, queue, groupname, groupname, dir, dir, dir, dir);
		system(sysstring);


		}

CPU/juicer.sh

+1 −0

Original line number	Diff line number	Diff line
		@@ -613,6 +613,7 @@ then
		fi
		fi


		if [ -n "$deduponly" ]
		then
		exit 0

README.md

+14 −17

Original line number	Diff line number	Diff line
		# Juicer
		# Read this first!!

		To access Juicer 1.6 (the last stable release), please see [the GitHub release](https://github.com/aidenlab/juicer/releases/tag/1.6). If you clone the Juicer repo directly from GitHub, you will get Juicer 2, which is under active development. If you encounter any bugs, please let us know.

		# About Juicer

		Juicer is a platform for analyzing kilobase resolution Hi-C data. In this
		distribution, we include the pipeline for generating Hi-C maps from fastq raw
		data files and command line tools for feature annotation on the Hi-C maps.

		Juicer is currently in its beta release, Juicer version 1.5.
		The beta release for Juicer version 1.6 can be accessed via [the Github Release](https://github.com/aidenlab/juicer/releases/tag/1.6). The main repository on Github is now focused on the Juicer 2.0 release and is under active development.
		For general questions, please use
		[the Google Group](https://groups.google.com/forum/#!forum/3d-genomics).

		If you have further difficulties using Juicer, please do not
		hesitate to contact us (theaidenlab@gmail.com)
		hesitate to contact us (aidenlab@bcm.edu)

		**If you use Juicer in your research, please cite:
		Neva C. Durand, Muhammad S. Shamim, Ido Machol, Suhas S. P. Rao, Miriam H. Huntley, Eric S. Lander, and Erez Lieberman Aiden. "Juicer provides a one-click system for analyzing loop-resolution Hi-C experiments." Cell Systems 3(1), 2016.**

		# Documentation
		Please see [the wiki](https://github.com/theaidenlab/juicer/wiki) for extensive documentation.
		Please see [the wiki](https://github.com/aidenlab/juicer/wiki) for extensive documentation.

		# Questions?
		For FAQs, or for asking new questions, please see our forum: [aidenlab.org/forum.html](http://aidenlab.org/forum.html).
		@@ -60,7 +64,7 @@ Juicer currently works with the following resource management software:
		### Juicer tools requirements

		The minimum software requirement to run Juicer is a working Java installation
		(version >= 1.7) on Windows, Linux, and Mac OSX. We recommend using the
		(version >= 1.8) on Windows, Linux, and Mac OSX. We recommend using the
		latest Java version available, but please do not use the Java Beta Version.
		Minimum system requirements for running Java can be found at
		https://java.com/en/download/help/sysreq.xml
		@@ -98,6 +102,8 @@ For best performance, use a dedicated GPU. You may also be able to obtain
		access to GPU clusters through Amazon Web Services or a local research
		institution.

		If you cannot access a GPU, you can run the [CPU version of HiCCUPS](https://github.com/aidenlab/juicer/wiki/CPU-HiCCUPS) directly using the `.hic` file and Juicer Tools.

		### Building new jars

		See the Juicebox documentation at <https://github.com/theaidenlab/Juicebox> for
		@@ -176,24 +182,15 @@ Command Line Tools Usage
		------------------------
		Detailed documentation about the command line tools can be found on the wiki:

		* [Annotating features with Arrowhead, HiCCUPS, MotifFinder, APA, Eigenvector, and Pearsons](https://github.com/theaidenlab/juicer/wiki/Feature-Annotation)
		* [Creating .hic with Pre](https://github.com/theaidenlab/juicer/wiki/Pre)
		* [Extracting data from .hic files with dump](https://github.com/theaidenlab/juicer/wiki/Data-Extraction)
		* [Annotating features with Arrowhead, HiCCUPS, MotifFinder, APA, Eigenvector, and Pearsons](https://github.com/aidenlab/juicer/wiki/Feature-Annotation)
		* [Creating .hic with Pre](https://github.com/aidenlab/juicer/wiki/Pre)
		* [Extracting data from .hic files with straw](https://github.com/aidenlab/straw)

		To launch the command line tools, use the shell script “juicer_tools” on Unix/MacOS
		or type
		```
		java -jar juicer_tools.jar (command...) [flags...] <parameters...>
		```
		There are different flavors of juicer_tools that depend on the CUDA version.
		If you do not use GPUs, these versions are equivalent. Otherwise,
		juicer_tools.X.X.jar uses CUDA version X.X

		For HiCCUPS loop calling without the shell or bat script, you will need to
		call:
		`java -Xms512m -Xmx2048m -Djava.library.path=path/to/natives/ -jar juicer_tools.jar hiccups [flags...] <parameters...>`
		where path/to/natives is the path to the native libraries used for JCuda.
		By default, these are located in the lib/jcuda folder.

		In the command line tools, there are several analysis functions:

Admin message