Commit a9d984ad authored by TomKellyGenetics's avatar TomKellyGenetics
Browse files

Merge branch 'kai' of dgt-gitlab.gsc.riken.jp:tom/cellranger_convert

parents 74e9b127 0ab06d24
Loading
Loading
Loading
Loading
+143 −165
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ if [[ -z $cellrangerpass ]]; then
    echo "cellranger command is not found."
    exit 1
fi
ver_info=`paste -d "\n" <(cellranger count --version) <(echo conversion script version 0.2.0.9002) | head -n 3 | tail -n 2`
ver_info=`paste -d "\n" <(cellranger count --version) <(echo conversion script version 0.2.0.900333) | head -n 3 | tail -n 2`
##########


@@ -39,28 +39,28 @@ echo "Running launch_universc.sh in '$SCRIPT_DIR'"
#####usage statement#####
help='
Usage:
  bash $(basename "$0") -R1 FILE1 -R2 FILE2 -t TECHNOLOGY -i ID -r REFERENCE [--option OPT]
  bash $(basename "$0") -R1 READ1_LANE1 READ1_LANE2 -R2 READ2_LANE1 READ2_LANE2 -t TECHNOLOGY -i ID -r REFERENCE [--option=OPT]
  bash $(basename "$0") -f SAMPLE_LANE -t TECHNOLOGY -i ID -r REFERENCE [--option=OPT]
  bash $(basename "$0") -f \"SAMPLE_LANE1 SAMPLE_LANE2\" -t TECHNOLOGY -i ID -r REFERENCE [--option=OPT]
  bash $(basename "$0") -v
  bash $(basename "$0") -h
  bash $(basename "$0") -t TECHNOLOGY --setup
  bash '$(basename $0)' -R1 FILE1 -R2 FILE2 -t TECHNOLOGY -i ID -r REFERENCE [--option OPT]
  bash '$(basename $0)' -R1 READ1_LANE1 READ1_LANE2 -R2 READ2_LANE1 READ2_LANE2 -t TECHNOLOGY -i ID -r REFERENCE [--option OPT]
  bash '$(basename $0)' -f SAMPLE_LANE -t TECHNOLOGY -i ID -r REFERENCE [--option OPT]
  bash '$(basename $0)' -f SAMPLE_LANE1 SAMPLE_LANE2 -t TECHNOLOGY -i ID -r REFERENCE [--option OPT]
  bash '$(basename $0)' -v
  bash '$(basename $0)' -h
  bash '$(basename $0)' -t TECHNOLOGY --setup

Convert sequencing data (FASTQ) from various platforms for compatibility with 10x Genomics and run cellranger count

Mandatory arguments to long options are mandatory for short options too.
  -s,  --setup                  Set up whitelists for compatibility with new technology
  -t,  --technology=PLATFORM    Name of technology used to generate data (10x, nadia, icell8)
  -R1, --read1=FILE             Read 1 FASTQ file to pass to cellranger (cell barcodes and umi)
  -R2, --read2=FILE             Read 2 FASTQ file to pass to cellranger
  -f,  --file=NAME              Name of FASTQ files to pass to cellranger (prefix before R1 or R2)
  -i,  --id=ID                  A unique run id, used to name output folder
  -d,  --description=TEXT       Sample description to embed in output files.
  -r,  --reference=DIR          Path of directory containing 10x-compatible reference.
  -c,  --chemistry=CHEM         Assay configuration, autodetection is not possible for converted files: 'SC3Pv2' (default), 'SC5P-PE', or 'SC5P-R2'
  -n,  --force-cells=NUM        Force pipeline to use this number of cells, bypassing the cell detection algorithm.
  -j,  --jobmode=MODE           Job manager to use. Valid options: 'local' (default), 'sge', 'lsf', or a .template file
  -t,  --technology PLATFORM    Name of technology used to generate data (10x, nadia, icell8)
  -R1, --read1 FILE             Read 1 FASTQ file to pass to cellranger (cell barcodes and umi)
  -R2, --read2 FILE             Read 2 FASTQ file to pass to cellranger
  -f,  --file NAME              Name of FASTQ files to pass to cellranger (prefix before R1 or R2)
  -i,  --id ID                  A unique run id, used to name output folder
  -d,  --description TEXT       Sample description to embed in output files.
  -r,  --reference DIR          Path of directory containing 10x-compatible reference.
  -c,  --chemistry CHEM         Assay configuration, autodetection is not possible for converted files: 'SC3Pv2' (default), 'SC5P-PE', or 'SC5P-R2'
  -n,  --force-cells NUM        Force pipeline to use this number of cells, bypassing the cell detection algorithm.
  -j,  --jobmode MODE           Job manager to use. Valid options: 'local' (default), 'sge', 'lsf', or a .template file
  -w,  --overwrite              How to carry out FASTQ file conversions: 'skip' to skip conversion, 'convert' to overwrite all preexisting converted files, and 'keep' (default) to only convert files that do not already exist.
  -h,  --help                   Display this help and exit
  -v,  --version                Output version information and exit
@@ -82,8 +82,18 @@ fi



#####reading in options#####
setup="false"
#####options#####
#set options
DIR=`which cellranger` #location of cellranger
VERSION=`cellranger count --version | head -n 2 | tail -n 1 | cut -d"(" -f2 | cut -d")" -f1` #get cellranger version
lockfile=${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock #path for .lock file
lastcallfile=${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called #path for .last_called
lastcall=`cat $lastcallfile`
barcodefolder=${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes #folder with the barcodes
cdrIN="input4cellranger" #name of the directory with all FASTQ files given to cellranger

#variable options
setup=false
read1=()
read2=()
SAMPLE=""
@@ -94,10 +104,7 @@ reference=""
ncells=""
chemistry=""
jobmode=""
convert="keep"

DIR=`which cellranger`
VERSION=`cellranger count --version | head -n 2 | tail -n 1 | cut -d"(" -f2 | cut -d")" -f1`
convert=keep

next=false
for op in "$@"; do
@@ -123,6 +130,7 @@ for op in "$@"; do
        shift
            if [[ $1 != "" ]]; then
                technology="${1/%\//}"
                technology=`echo "$technology" | tr '[:upper:]' '[:lower:]'`
                next=true
                shift
            else
@@ -162,8 +170,7 @@ for op in "$@"; do
            ;;
        -f|--file)
            shift
            if [[ "$1" != "" ]]
                then
            if [[ "$1" != "" ]]; then
                arg=$1
                while [[ ! "$arg" == "-"* ]] && [[ "$arg" != "" ]]; do
                    read1+=("${1}_R1_001")
@@ -268,53 +275,7 @@ for op in "$@"; do
done
##########

#####check if UniverSC is running already#####
#create .lock file if none exists
if [[ ! -f  ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock ]]; then
    echo "creating lock file"
    echo 0 > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock
fi

#check if jobs running (check value in .lock file)
echo "checking .lock file"
lock=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock`

if [[ ! $lock == "0" ]]; then
    echo " $lock number of cellranger ${VERSION} jobs already running in ${DIR}"
    #check technology current running
    if [[ -f ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called ]]; then
        last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
        echo " running $lock jobs with technology $last"
        #check if the technology running is different from the current convert call
        if [[ $last == $technology ]]; then
            echo " no conflict detected"
            #add disable increment for setup calls (which are not counted or removed)
            if [[ $setup == false ]]; then
                #add current job to lock
                echo " increment lock"
                lock=$(($lock+1))
                echo $lock > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock
                echo " call accepted: running $lock cellranger jobs on $technology"
            fi
	else
	    echo "Error: conflict between technology selected for the new job ($technology) and for $lock jobs currently running ($last)"
            echo "barcode whitelist configured and locked for currently running technology: $last"
            echo "remove ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock if $last jobs have completed or aborted"
            exit 1
        fi
    fi
else
    #initialise lock file if first call (no other jobs running)
    #add disable increment for setup calls (which are not counted or removed)
    if [[ $setup == false ]]; then
        #add current job to lock
        echo " increment lock"
        lock=$(($lock+1))
        echo " call accepted: running no other cellranger jobs"
        echo $lock > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock
    fi
fi
##########

#####check if input maches expected inputs#####
if [[ $verbose == "true" ]]; then
@@ -322,26 +283,21 @@ if [[ $verbose == "true" ]]; then
fi

#check if technology matches expected inputs
technology=`echo "$technology" | tr '[:upper:]' '[:lower:]'`
if [[ "$technology" != "10x" ]] && [[ "$technology" != "nadia" ]] && [[ "$technology" != "icell8" ]]; then
    echo "Error: option -t needs to be 10x, nadia, or icell8"
    exit 1
fi

#check if setup needs to be run before analysis
if [[ -z $setup ]]; then
    setup=false
fi

#check for presence of read1 and read2 files
if [[ ${#read1[@]} -eq 0 ]] && [[ $setup == "false" ]]; then
if [[ $setup == "false" ]]; then
    if [[ ${#read1[@]} -eq 0 ]]; then
        echo "Error: option -R1 or --file is required"
        exit 1
fi
if [[ ${#read2[@]} -eq 0 ]] && [[ $setup == "false" ]]; then
    elif [[ ${#read2[@]} -eq 0 ]]; then
        echo "Error: option -R2 or --file is required"
        exit 1
    fi
fi

#check for file type (extension) for files
##allows incomplete file names and processing compressed files
@@ -357,9 +313,11 @@ for i in ${!read1[@]}; do
            read=`echo $read | sed -e "s/\.gz//g"`
        fi
        if [[ $read != *"fastq" ]] && [[ $read != *"fq" ]]; then
            echo "***Warning: file $read is assubed to be in fastq format***"
            echo "***Warning: file $read is assumed to be in fastq format***"
        fi
        if [[ $verbose == "true" ]]; then
            echo "  $read"
        fi
    elif [[ -f $read ]]; then
        if [[ $read == *"gz" ]]; then
            gunzip -k $read
@@ -369,7 +327,9 @@ for i in ${!read1[@]}; do
        if [[ $read != *"fastq" ]] && [[ $read != *"fq" ]]; then
            echo "***Warning: file $read is assubed to be in fastq format***"
        fi
        if [[ $verbose == "true" ]]; then
            echo "  $read"
        fi
    else
        echo "Error: $read not found"
        exit 1
@@ -391,7 +351,9 @@ for i in ${!read2[@]}; do
        if [[ $read != *"fastq" ]] && [[ $read != *"fq" ]]; then
            echo "***Warning: file $read is assubed to be in fastq format***"
        fi
        if [[ $verbose == "true" ]]; then
	    echo "  $read"
	fi
    elif [[ -f $read ]]; then
        if [[ $read == *"gz" ]]; then
            gunzip -k $read
@@ -401,7 +363,9 @@ for i in ${!read2[@]}; do
        if [[ $read != *"fastq" ]] && [[ $read != *"fq" ]]; then
            echo "***Warning: file $read is assubed to be in fastq format***"
        fi
        if [[ $verbose = "true" ]]; then
            echo "  $read"
        fi
    else
        echo "Error: $read not found"
        exit 1
@@ -413,6 +377,7 @@ done
if [[ $verbose == "true" ]]; then
    echo " checking file name for $read1 ..."
fi

for i in ${!read1[@]}; do
    read=${read1[$i]}
    if [[ -h $read ]]; then
@@ -607,7 +572,7 @@ done
LANE=$(echo "${LANE[@]}" | tr ' ' '\n' | sort -u | tr '\n' ',' | sed 's/,$//')

#check if ID is present
if [[ -z $id ]] && ! [[ ${#read1[@]} -eq 0 ]]; then
if [[ -z $id ]] && [[ ${#read1[@]} -eq 0 ]]; then
    echo "Error: option --id is required"
    exit 1
elif [[ $id == *" "* ]]; then
@@ -616,10 +581,12 @@ elif [[ $id == *" "* ]]; then
fi

#check if reference is present
if [[ -z $reference ]] && [[ $setup == "false" ]]; then
    echo "Error: option --reference is required"
if [[ -z $reference ]]; then
    if [[ $setup == "false" ]] || [[ ${#read1[@]} -ne 0 ]] || [[ ${#read2[@]} -ne 0 ]]; then
        echo "Error: option --reference is required";
        exit 1
    fi
fi

#check if ncells is an integer
int='^[0-9]+$'
@@ -651,6 +618,53 @@ if [[ "$convert" != "keep" ]] && [[ "$convert" != "skip" ]] && [[ "$convert" !=
    echo "Error: option --overwrite needs to be keep, skip, or convert"
    exit 1
fi

#check if setup needs to be run before analysis
if [[ -z $setup ]]; then
    setup=false
fi
if [[ $lastcall != $technology ]]; then
    setup=true
fi
##########



#####check if UniverSC is running already#####
#set up .lock file
if [[ ! -f $lockfile ]]; then
    echo "creating .lock file"
    echo 0 > $lockfile
else
    #check if jobs running (check value in .lock file)
    echo "checking .lock file"
    lock=`cat $lockfile`
    
    if [[ $lock -eq 0 ]]; then
        echo " call accepted: no other cellranger jobs running"
        lock=1
        echo $lock > $lockfile
    else
        if [[ -f $lastcallfile ]]; then
            echo " total of $lock cellranger ${VERSION} jobs already running in ${DIR} with technology $lastcall"
            
            #check if the technology running is different from the current convert call
            if [[ $lastcall == $technology ]]; then
                echo " call accepted: no conflict detected with other jobs currently running"
                #add current job to lock
                lock=$(($lock+1))
                echo $lock > $lockfile
	    else
	        echo "Error: conflict between technology selected for the new job ($technology) and other jobs currently running ($lastcall)"
                echo "barcode whitelist configured and locked for currently running technology: $lastcall"
                echo "remove $lockfile if $lastcall jobs have completed or aborted"
                exit 1
            fi
        else
            echo "Error: $lastcallfile not found"
        fi
    fi
fi
##########


@@ -659,17 +673,13 @@ fi
echo ""
echo "#####Input information#####"
echo "SETUP: $setup"
if ! [[ $setup == "false" ]]; then
    echo "***Warning: whitelist is converted for compatibility, valid barcodes cannot be detected accurately with this technology***"
echo "FORMAT: $technology"
if [[ $technology == "nadia" ]]; then
    echo "***Warning: whitelist is converted for compatibility with $technology, valid barcodes cannot be detected accurately with this technology***"
fi
if [[ ${#read1[@]} -eq 0 ]] && [[ ${#read1[@]} -eq 0 ]]; then
    echo "***Warning: no FASTQ files were selected, launch_universc.sh will exit after setting up the whitelist***"
fi
fi
if ! [[ "$technology" == "10x" ]]; then
    echo "FORMAT: $technology"
else
    echo "FORMAT: $technology (no conversion)"
fi
if ! [[ ${#read1[@]} -eq 0 ]]; then
    echo "INPUT(R1):"
    for i in ${!read1[@]}; do
@@ -714,7 +724,7 @@ echo ""
#run setup if called
if [[ $setup == "true" ]]; then
    echo "setup begin"
    if [[ ! -w ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes ]]; then
    if [[ ! -w $barcodefolder ]]; then
        echo "Error: launch_universc.sh can only be run cellranger installed locally"
        echo "Running cellranger installed at $DIR"
        echo "Install cellranger in a directory with write permissions such as /home/`whoami`/local"
@@ -723,13 +733,14 @@ if [[ $setup == "true" ]]; then
        echo " `whereis cellranger`"
        exit 1
    fi
    cd ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes
    
    cd $barcodefolder
    
    if [[ -f 3M-february-2018.txt ]]; then
        gzip -f 3M-february-2018.txt
    fi
    
    echo "updating barcodes in ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes for cellranger version $VERSION installed in $DIR ..."
    echo "updating barcodes in $barcodefolder for cellranger version $VERSION installed in $DIR ..."
    
    if [[ $technology == "10x" ]]; then
        #restore 10x barcodes if scripts has already been run (allows changing Nadia to iCELL8)
@@ -752,8 +763,7 @@ if [[ $setup == "true" ]]; then
        #if cellranger version is 3 or greater, restore assert functions
        if [[ `printf '%s\n' '$VERSION 3.0.0' | sort -V | head -n 1` != $VERSION ]]; then
            #restore functions if other technology run previously
            last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
            if [[ $last != $technology ]]; then
            if [[ $lastcall != $technology ]]; then
                sed -i "s/#assert barcode_idx >= prev_barcode_idx/assert barcode_idx >= prev_barcode_idx/g" ${DIR}-cs/${VERSION}/mro/stages/counter/report_molecules/__init__.py
                sed -i "s/#assert np.array_equal(in_mc.get_barcodes(), barcodes)/assert np.array_equal(in_mc.get_barcodes(), barcodes)/g" ${DIR}-cs/${VERSION}/lib/python/cellranger/molecule_counter.py
            fi
@@ -782,7 +792,6 @@ if [[ $setup == "true" ]]; then
            echo " backing up whitelist of version 2 kit ..."
            cp 737K-august-2016.txt 737K-august-2016.txt.backup
        fi
        echo "Warning: Valid barcodes cannot be calculated accurately for Nadia or DropSeq technology"
        #combine 10x and Nadia barcodes
        cat nadia_barcode.txt 737K-august-2016.txt.backup | sort | uniq > 737K-august-2016.txt
        echo " whitelist converted for Nadia compatibility with version 2 kit."
@@ -814,8 +823,7 @@ if [[ $setup == "true" ]]; then
        #if cellranger version is 3 or greater, restore assert functions
        if [[ `printf '%s\n' '$VERSION 3.0.0' | sort -V | head -n 1` != $VERSION ]]; then
            #disable functions if 10x technology run previously
            last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
            if [[ $last != "10x" ]]; then
            if [[ $lastcall != "10x" ]]; then
                sed -i "s/assert barcode_idx >= prev_barcode_idx/#assert barcode_idx >= prev_barcode_idx/g" ${DIR}-cs/${VERSION}/mro/stages/counter/report_molecules/__init__.py
                sed -i "s/assert np.array_equal(in_mc.get_barcodes(), barcodes)/#assert np.array_equal(in_mc.get_barcodes(), barcodes)/g"  ${DIR}-cs/${VERSION}/lib/python/cellranger/molecule_counter.py
            fi
@@ -880,8 +888,7 @@ if [[ $setup == "true" ]]; then
        #if cellranger version is 3 or greater, restore assert functions
        if [[ `printf '%s\n' '$VERSION 3.0.0' | sort -V | head -n 1` != $VERSION ]]; then
            #disable functions if 10x technology run previously
            last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
            if [[ $last != "10x" ]]; then
            if [[ $lastcall != "10x" ]]; then
                sed -i "s/assert barcode_idx >= prev_barcode_idx/#assert barcode_idx >= prev_barcode_idx/g" ${DIR}-cs/${VERSION}/mro/stages/counter/report_molecules/__init__.py
                sed -i "s/assert np.array_equal(in_mc.get_barcodes(), barcodes)/#assert np.array_equal(in_mc.get_barcodes(), barcodes)/g" ${DIR}-cs/${VERSION}/lib/python/cellranger/molecule_counter.py
            fi
@@ -890,16 +897,22 @@ if [[ $setup == "true" ]]; then
            echo "$DIR ready for $technology"
        fi
    else
        lock=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock`
        lock=$(($lock-1))
        echo $lock > $lockfile
        echo "Error: technology ($technology) is not supported"
        cd -
        exit 1
    fi
    echo $technology > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called
    echo $technology > $lastcallfile
    cd -
    echo "setup complete"
fi

if [[ ${#read1[@]} -eq 0 ]] && [[ ${#read2[@]} -eq 0 ]]; then
    lock=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock`
    lock=$(($lock-1))
    echo $lock > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock
    echo " whitelist converted and no FASTQ files are selected. exiting launch_universc.sh"
    exit 0
fi
@@ -907,37 +920,8 @@ fi



#####detect whitelist directory#####
#check whitelist and run --setup if needed
echo "checking whitelist ..."
if [[ -f ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called ]]; then
    #run again if last technology called is different from technology
    last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
    if [[ $last != $technology ]]; then
        echo " running setup on $technology on whitelist in ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes ..."
        if [[ $verbose == "true" ]]; then
            echo " last: $last"
            echo " technology: $technology"
        fi
        bash $(basename "$0") -t $technology --setup
        setup=false
        if [[ $convert == "skip" ]]; then
            echo " ***Warning: technology changed to $technology since last run. FASTQ file conversion will be carreid out.***"
            convert="convert"
        fi
    fi
else    
    echo " using setup $technology from previous whitelist configuration ..."
    echo $technology > ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called
fi
echo "check complete"
##########



#####create directory with files fed to cellranger#####
echo "creating a folder for all cellranger input files ..."
crIN="cellranger"
convFiles=()

if [[ ! -d $crIN ]]; then
@@ -1032,6 +1016,8 @@ fi

#####run cellranger#####
echo "running cellranger ..."
echo ""
echo "#####cellranger#####"
d=""
if [[ -n $description ]]; then
    d="--description=$description"
@@ -1071,6 +1057,8 @@ cellranger count --id=$id \
#        --nopreflight
end=`date +%s`
runtime=$((end-start))
echo "##########"
echo ""
##########


@@ -1078,26 +1066,17 @@ runtime=$((end-start))
#####remove files if convert is not running elsewhere#####
echo "updating .lock file"

#reset lock counter (read in case changed by other jobs)
lock=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock`

#remove currewnt job from counter (successfully completed)
lock=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock`
lock=$(($lock-1))

#check if jobs running
if [[ $lock -ge 1 ]]; then
    echo "$lock number of cellranger ${VERSION} jobs still running in ${DIR}"
    #check technology current running
    if [[ -f ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called ]]; then
        last=`cat ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.last_called`
        echo "running technology $last with $lock jobs"
    fi
fi

#remove .lock file if no other jobs running exists (prevents negative values allowing technologies to run at same time)
if [[ $lock -le 0 ]]; then
    echo "no other jobs currently running: lock files cleared for cellranger ${VERSION} in ${DIR}"
    echo " total of $lock jobs for $lastcall technology are still run by cellranger ${VERSION} in ${DIR}"
else
    echo " no other jobs currently run by cellranger ${VERSION} in ${DIR}"
    echo " no conflicts: whitelist can now be changed for other technologies"
    rm -f ${DIR}-cs/${VERSION}/lib/python/cellranger/barcodes/.lock
    rm -f $lockfile
fi
##########

@@ -1117,4 +1096,3 @@ echo "$log"
##########

exit 0