Implemented per cell data extractor. (53abe621) · Commits · github_fork / Universc

ExtractBasicStats.pl

0 → 100755

+314 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/perl

		#########################################
		# #
		# Written by Kai Battenberg #
		# Plant Symbiosis Research Team #
		# #
		#########################################

		use strict;
		use warnings;
		use List::Util qw( min max sum);

		#####Options#####
		my $indir = $ARGV[0];
		##########



		#####Check input#####
		#collect files needed
		my $feature_file = $indir."/outs/raw_feature_bc_matrix/features.tsv.gz";
		my $raw_barcode_file = $indir."/outs/raw_feature_bc_matrix/barcodes.tsv.gz";
		my $filtered_barcode_file = $indir."/outs/filtered_feature_bc_matrix/barcodes.tsv.gz";
		my $matrix_file = $indir."/outs/filtered_feature_bc_matrix/matrix.mtx.gz";
		my $summary_file = $indir."/outs/metrics_summary.csv";
		my $bam_file = $indir."/outs/possorted_genome_bam.bam";
		my $out_file = $indir."/outs/basic_stats.txt";

		if (! -e $indir) {
		die "Error: Data directory needs to be specified.\n";
		}
		elsif (! -e $feature_file \|\| ! -e $raw_barcode_file \|\| ! -e $filtered_barcode_file \|\| ! -e $matrix_file \|\| ! -e $summary_file \|\| ! -e $bam_file) {
		die "Error: Iether selected directory is not a cellranger output or some critical files are missing.\n";
		}
		##########



		#####MAIN#####
		#gene count
		print "getting gene count.\n";

		my $ref_gene_count = `zcat $feature_file \| wc -l`;
		chomp $ref_gene_count;

		#cell count
		print "getting cell count.\n";

		my $cellcount_raw;
		$cellcount_raw = `zcat $raw_barcode_file \| wc -l`;
		chomp $cellcount_raw;

		my $cellcount_filtered;
		$cellcount_filtered = `zcat $filtered_barcode_file \| wc -l`;
		chomp $cellcount_filtered;

		my %percell;
		my $barcodes;
		$barcodes = `zcat $filtered_barcode_file \| cut -f1 -d'-'`;
		chomp $barcodes;
		my @barcodes = split (/\n/, $barcodes);

		foreach my $barcode (@barcodes) {
		my %barcode;
		$percell{$barcode} = \%barcode;
		}

		#raw read count
		print "getting raw read count.\n";

		my $readcount_raw = `tail -n 1 $summary_file`;
		chomp $readcount_raw;

		my @readcount_raw = split (//, $readcount_raw);
		my $quotestat = 0;
		foreach my $character (@readcount_raw) {
		if ($character =~ m/\"/ && $quotestat == 0) {
		$character = "";
		$quotestat = 1;
		next;
		}
		elsif ($character =~ m/\"/ && $quotestat == 1) {
		$character = "";
		$quotestat = 0;
		next;
		}
		elsif ($character =~ m/\,/ && $quotestat == 1) {
		$character = "";
		next;
		}
		else {
		next;
		}
		}
		$readcount_raw = join ("", @readcount_raw);
		$readcount_raw = (split (/\,/, $readcount_raw))[3];

		#assigned read count
		print "getting assigned read count.\n";

		my $assigned_reads_percell = `samtools view $bam_file \| grep 'CB:Z:' \| sed 's\/CB:Z:\$[ACGT]\$./\\1/' \| awk '{print \$1, \$NF}' \| sort \| uniq \| cut -f2 -d' ' \| sort \| uniq -c`;
		chomp $assigned_reads_percell;
		my @assigned_reads_percell = split (/\n/, $assigned_reads_percell);

		foreach my $barcode (@assigned_reads_percell) {
		my $count = $barcode;
		$count =~ s/ +/ /g;
		$count =~ s/^ //;
		$count =~ s/ $//;
		my @count = split (/ /, $count);

		if (exists $percell{$count[1]}) {
		my %count;
		$count{ASSIGNED} = $count[0];
		$percell{$count[1]} = \%count;
		}
		}
		foreach my $barcode (sort keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		if (!$barcode{ASSIGNED}) {
		$barcode{ASSIGNED} = 0;
		}
		$percell{$barcode} = \%barcode;
		}

		#mapped read count
		print "getting mapped read count.\n";

		my $mapped_reads_percell = `samtools view -F 4 $bam_file \| grep 'CB:Z:' \| sed 's\/CB:Z:\$[ACGT]\$./\\1/' \| awk '{print \$1, \$NF}' \| sort \| uniq \| cut -f2 -d' ' \| sort \| uniq -c`;
		chomp $mapped_reads_percell;
		my @mapped_reads_percell = split (/\n/, $mapped_reads_percell);

		foreach my $barcode (@mapped_reads_percell) {
		my $count = $barcode;
		$count =~ s/ +/ /g;
		$count =~ s/^ //;
		$count =~ s/ $//;
		my @count = split (/ /, $count);

		if (exists $percell{$count[1]}) {
		my %count = %{ $percell{$count[1]} };
		$count{MAPPED} = $count[0];
		$percell{$count[1]} = \%count;
		}
		}
		foreach my $barcode (sort keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		if (!$barcode{MAPPED}) {
		$barcode{MAPPED} = 0;
		}
		$percell{$barcode} = \%barcode;
		}

		#UMI count
		print "getting mapped UMI count.\n";

		my %barcodes;
		my $id = 0;
		foreach my $barcode (@barcodes) {
		$id++;
		$barcodes{$id} = $barcode;
		}

		my $umicount = `zcat $matrix_file \| tail -n +4 \| cut -f2,3 -d ' '`;
		chomp $umicount;
		my @umicount = split (/\n/, $umicount);
		my %umicount;
		foreach my $entry (@umicount) {
		my @data = split (/ /, $entry);
		if (!exists $umicount{$data[0]}) {
		$umicount{$data[0]} = $data[1];
		}
		else {
		$umicount{$data[0]} = $umicount{$data[0]} + $data[1];
		}
		}
		foreach my $id (keys %umicount) {
		if (exists $percell{ $barcodes{$id} }) {
		my %data = %{ $percell{ $barcodes{$id} } };
		$data{UMI} = $umicount{$id};

		$percell{ $barcodes{$id} } = \%data;
		}
		}
		foreach my $barcode (sort keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		if (!$barcode{UMI}) {
		$barcode{UMI} = 0;
		}
		$percell{$barcode} = \%barcode;
		}

		#Gene count per cell
		print "getting gene count per cell.\n";

		my @genecount = @umicount;
		my %genecount;
		foreach my $entry (@genecount) {
		my @data = split (/ /, $entry);
		if (!exists $genecount{$data[0]}) {
		$genecount{$data[0]} = 1;
		}
		else {
		$genecount{$data[0]}++;
		}
		}
		foreach my $id (keys %genecount) {
		if (exists $percell{ $barcodes{$id} }) {
		my %data = %{ $percell{ $barcodes{$id} } };
		$data{GENE} = $genecount{$id};

		$percell{ $barcodes{$id} } = \%data;
		}
		}
		foreach my $barcode (sort keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		if (!$barcode{GENE}) {
		$barcode{GENE} = 0;
		}
		$percell{$barcode} = \%barcode;
		}

		#Total gene count
		print "getting total gene count.\n";

		my $genes_total = `zcat $matrix_file \| tail -n +4 \| cut -f1 -d ' ' \| sort \| uniq \| wc -l`;
		chomp $genes_total;

		#Total, means, and medinans
		print "getting total, mean, and medians.\n";

		my @reads_assigned;
		my @reads_mapped;
		my @umis;
		my @genes;
		foreach my $barcode (keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		push (@reads_assigned, $barcode{ASSIGNED});
		push (@reads_mapped, $barcode{MAPPED});
		push (@umis, $barcode{UMI});
		push (@genes, $barcode{GENE});
		}

		my $reads_assigned_total = sum (@reads_assigned);
		my $reads_assigned_mean = &mean (@reads_assigned);
		my $reads_assigned_median = &median (@reads_assigned);

		my $reads_mapped_total = sum (@reads_mapped);
		my $reads_mapped_mean = &mean (@reads_mapped);
		my $reads_mapped_median = &median (@reads_mapped);

		my $umis_total = sum (@umis);
		my $umis_mean = &mean (@umis);
		my $umis_median = &median (@umis);

		my $genes_mean = &mean (@genes);
		my $genes_median = &median (@genes);

		#getting percentages
		print "getting percentages.\n";

		my $percent_assigned = sprintf("%.4f", $reads_assigned_total/$readcount_raw)*100;
		$percent_assigned = $percent_assigned."\%";
		my $percent_mapped = sprintf("%.4f", $reads_mapped_total/$reads_assigned_total)*100;
		$percent_mapped = $percent_mapped."\%";
		my $reads_per_umi = sprintf("%.2f", $reads_mapped_total/$umis_total);

		#output
		print "printing out summary.\n";

		open (OUT, ">$out_file") or die "cannot open $out_file.\n";
		print OUT "\#\#\#\#\#BASIC STATS\#\#\#\#\#\n";
		print OUT "Reference gene count:\t$ref_gene_count\n";
		print OUT "Cell count:\t(raw)$cellcount_raw\t(filtered)$cellcount_filtered\n";
		print OUT "Read count(raw):\t$readcount_raw\n";
		print OUT "Read count(assigned):\t(total)$reads_assigned_total\t(per cell mean)$reads_assigned_mean\t(per cell median)$reads_assigned_median\n";
		print OUT "Read count(mapped):\t(total)$reads_mapped_total\t(per cell mean)$reads_mapped_mean\t(per cell median)$reads_mapped_median\n";
		print OUT "UMI count(mapped to gene):\t(total)$umis_total\t(per cell mean)$umis_mean\t(per cell median)$umis_median\n";
		print OUT "Gene count:\t(total)$genes_total\t(per cell mean)$genes_mean\t(per cell median)$genes_median\n";
		print OUT "\%Reads assigned/raw:\t$percent_assigned\n";
		print OUT "\%Reads mapped/assigned:\t$percent_mapped\n";
		print OUT "reads/UMI:\t$reads_per_umi\n";
		print OUT "\n";
		print OUT "\#\#\#\#\#PER CELL\#\#\#\#\#\n";
		print OUT "barcode\tRead count (assigned)\tRead count (mapped)\tUMI count(mapped)\tGene count\n";
		foreach my $barcode (sort keys %percell) {
		my %barcode = %{ $percell{$barcode} };
		print OUT "$barcode\t$barcode{ASSIGNED}\t$barcode{MAPPED}\t$barcode{UMI}\t$barcode{GENE}\n";
		}
		close (OUT);
		##########



		#####SUB#####
		sub mean {
		my $raw = sum(@_)/@_;
		my $round = sprintf("%.1f", $raw);
		return $round;
		}

		sub median {
		my @vals = sort {$a <=> $b} @_;
		my $len = @vals;
		if($len%2) {
		return $vals[int($len/2)];
		}
		else {
		return ($vals[int($len/2)-1] + $vals[int($len/2)])/2;
		}
		}
		##########
		__END__

launch_universc.sh

+32 −7

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@
		install=false

		######convert version#####
		convertversion="0.3.0.90003"
		convertversion="0.3.0.90004"
		##########


		@@ -41,6 +41,8 @@ if [[ $RDIR != $SDIR ]]; then
		echo "DIR '$RDIR' resolves to '$SDIR'"
		fi
		echo "Running launch_universc.sh in '$SDIR'"

		PERCELLSTATS=${SDIR}/ExtractBasicStats.pl
		##########


		@@ -59,7 +61,6 @@ Usage:
		Convert sequencing data (FASTQ) from Nadia or iCELL8 platforms for compatibility with 10x Genomics and run cellranger count

		Mandatory arguments to long options are mandatory for short options too.
		-s, --setup Set up whitelists for compatibility with new technology
		-t, --technology PLATFORM Name of technology used to generate data (10x, nadia, icell8, or custom)
		e.g. custom_16_10
		-R1, --read1 FILE Read 1 FASTQ file to pass to cellranger (cell barcodes and umi)
		@@ -67,9 +68,11 @@ Mandatory arguments to long options are mandatory for short options too.
		-f, --file NAME Path and the name of FASTQ files to pass to cellranger (prefix before R1 or R2)
		e.g. /path/to/files/Example_S1_L001
		-b, --barcodefile FILE Custom barcode list in plain text

		-i, --id ID A unique run id, used to name output folder
		-d, --description TEXT Sample description to embed in output files.
		-r, --reference DIR Path of directory containing 10x-compatible reference.

		-c, --chemistry CHEM Assay configuration, autodetection is not possible for converted files: 'SC3Pv2' (default), 'SC5P-PE', or 'SC5P-R2'
		-n, --force-cells NUM Force pipeline to use this number of cells, bypassing the cell detection algorithm.
		-j, --jobmode MODE Job manager to use. Valid options: 'local' (default), 'sge', 'lsf', or a .template file
		@@ -79,7 +82,12 @@ Mandatory arguments to long options are mandatory for short options too.
		Only applies when --jobmode=local.
		--mempercore NUM Set max GB each job may use at one time.
		Only applies in cluster jobmodes.
		-p, --pass Skips the FASTQ file conversion if converted files already exist

		-p, --per-cell-data Generates a file with basic run statistics along with per-cell data

		-s, --setup Set up whitelists for compatibility with new technology
		-w, --whitelist-as-is Skips the FASTQ file conversion if converted files already exist

		-h, --help Display this help and exit
		-v, --version Output version information and exit
		--verbose Print additional outputs for debugging
		@@ -112,6 +120,7 @@ lastcall=`[ -e $lastcallfile ] && cat $lastcallfile \|\| echo ""`
		barcodedir=${cellrangerpath}-cs/${cellrangerversion}/lib/python/cellranger/barcodes #folder within cellranger with the whitelist barcodes
		barcodefile=""
		crIN=input4cellranger #name of the directory with all FASTQ files given to cellranger
		percellfile="outs/basic_stats.txt"

		#variable options
		setup=false
		@@ -129,6 +138,7 @@ chemistry=""
		jobmode=""
		ncores=""
		mem=""
		percelldata=false

		next=false
		for op in "$@"; do
		@@ -324,7 +334,12 @@ for op in "$@"; do
		exit 1
		fi
		;;
		-p\|--pass)
		-p\|--per-cell-data)
		percelldata=true
		next=false
		shift
		;;
		-w\|--whitelist-as-is)
		convert=false
		next=false
		shift
		@@ -397,6 +412,7 @@ if [[ "$technology" != "10x" ]] && [[ "$technology" != "nadia" ]] && [[ "$techno
		echo "Error: when option -t is set as custom, a file with a list of barcodes needs to be specified with option -b."
		exit 1
		fi
		setup=true
		fi
		fi

		@@ -643,7 +659,6 @@ elif ! [[ $mem =~ $int ]] && [[ $setup == "false" ]]; then
		exit 1
		fi


		#check if chemistry matches expected input
		if [[ -z "$chemistry" ]]; then
		chemistry="SC3Pv2"
		@@ -1119,6 +1134,16 @@ fi



		#####extracting per cell data#####
		if [[ $percelldata == true ]]; then
		echo "generating basic run statistics and per cell data"
		perl ${PERCELLSTATS} ${id}
		echo "per cell data generated"
		fi
		##########



		#####printing out log#####
		log="
		#####Conversion tool log#####
		@@ -1126,7 +1151,7 @@ cellranger ${cellrangerversion}

		Original barcode format: ${technology} (then converted to 10x)

		Runtime: ${runtime}s
		cellranger runtime: ${runtime}s
		##########
		"
		echo "$log"

Admin message