1. separate visualization step; 2. adjust summary format; 3. add pand…

…oc check before running pipeline
Griffan · Feb 18, 2020 · 7788b88 · 7788b88
1 parent 1e977e1
commit 7788b88
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 10 deletions.
diff --git a/Readme.md b/Readme.md
@@ -19,7 +19,7 @@
 ### QUICK START
 To simplify the whole process, we prepared a one-stop script to process the whole pipeline or choose start point of the pipeline([] is optional)
 ```
-bin/FASTQuick.sh --steps <All|AllButIndex|Index|Align|Contamination|Ancestry> \
+bin/FASTQuick.sh --steps <All|AllButIndex|Index|Align|Contamination|Ancestry|Visualize> \
 --candidateVCF <variant list> \
 --reference <reference.fa> \
 --output <output.prefix> \

diff --git a/bin/FASTQuick_template.sh b/bin/FASTQuick_template.sh
@@ -19,7 +19,7 @@ if [[ ${PIPESTATUS[0]} -ne 4 ]]; then
     exit 1
 fi
 USAGE_MESSAGE="
-Usage: FASTQuick.sh [--steps All|AllButIndex|Index|Align|Contamination|Ancestry] --candidateVCF <1000g.phase3.site.vcf> --reference <reference.fa> --output <output.prefix> --index <index.prefix> --dbSNP <dbSNP.vcf.gz> --fastqList <one_pair_of_fq_or_single_fq_per_line> [--workingdir <directory>] [--callableRegion <callableRegion.bed>] [--targetRegion <targetRegion.bed>]
+Usage: FASTQuick.sh [--steps All|AllButIndex|Index|Align|Contamination|Ancestry|Visualize] --candidateVCF <1000g.phase3.site.vcf> --reference <reference.fa> --output <output.prefix> --index <index.prefix> --dbSNP <dbSNP.vcf.gz> --fastqList <one_pair_of_fq_or_single_fq_per_line> [--workingdir <directory>] [--callableRegion <callableRegion.bed>] [--targetRegion <targetRegion.bed>]
 
 	-l/--candidateVCF:  VCF format candidate variant list to choose from.
 	-r/--reference: reference genome fasta file to use.
@@ -113,6 +113,7 @@ done
 do_index=false
 do_align=false
 do_cont_anc=false
+do_viz=false
 #echo "--steps:$steps"
 if [[ "$steps" == *"allbutindex"* ]] ; then
 	do_align=true
@@ -122,14 +123,20 @@ elif [[ "$steps" == *"all"* ]] ; then
 	do_index=true
 	do_align=true
 	do_cont_anc=true
+	do_viz=true
 elif [[ "$steps" == *"index"* ]] ; then
 	do_index=true
 elif [[ "$steps" == *"align"* ]] ; then
 	do_align=true
+	do_viz=true
 elif [[ "$steps" == *"contamination"* ]] ; then
 	do_cont_anc=true
+	do_viz=true
 elif [[ "$steps" == *"ancestry"* ]] ; then
 	do_cont_anc=true
+	do_viz=true
+elif [[ "$steps" == *"visualize"* ]] ; then
+	do_viz=true
 fi
 
 
@@ -226,8 +233,9 @@ fi
 
 
 # Validate tools exist on path
-for tool in sort bcftools; do
-	if ! which $tool >/dev/null; then echo "Error: unable to find $tool on \$PATH" 1>&2 ; exit 2; fi
+for tool in sort bcftools pandoc; do
+	if ! which $tool >/dev/null; then
+	  echo "Error: unable to find $tool on \$PATH, please install before continue" 1>&2 ; exit 2; fi
 	echo "Found $(which $tool)" 1>&2
 done
 
@@ -373,10 +381,12 @@ if [[ $do_cont_anc == true ]] ; then
 		echo "$(date)	Failed to load eigen space files:${indexPrefix}.FASTQuick.fa.bed.phase3.vcf.gz.UD" | tee -a $timinglogfile
 	fi
 	echo "$(date)	Complete estimating contamination and genetic ancestry" | tee -a $timinglogfile
+else
+	echo "$(date)	Skipping estimating contamination and genetic ancestry..."| tee -a $timinglogfile
 fi
 
-if [[ -f "${outputPrefix}.Summary" ]] ; then
-echo "$(date)	Summarize basic	QC statistics..." | tee -a $timinglogfile
+if [[ $do_viz == true ]] ; then
+echo "$(date)	Visualize QC statistics..." | tee -a $timinglogfile
 { /usr/bin/time \
 	Rscript ${FASTQuick_BIN_DIR}/RPlotScript.R \
 	${outputPrefix} \

diff --git a/bin/FinalReport.rmd b/bin/FinalReport.rmd
@@ -212,8 +212,8 @@ q6=ggplot(Combined.Table,aes(x=InsertSize,y=Frequency,colour=Category))+geom_lin
 
 mydata <- scan(paste(input,".Summary",sep=""), what="", sep="\n")
 fileLen=length(mydata)-1#minus contamination line
-ExpectedDepth=as.numeric(strsplit(mydata[fileLen-13]," ")[[1]][5])
-EstimatedDepth=as.numeric(strsplit(mydata[fileLen-12]," ")[[1]][4])
+ExpectedDepth=as.numeric(strsplit(strsplit(mydata[fileLen-13]," ")[[1]][5], "\\[")[[1]][1])
+EstimatedDepth=as.numeric(strsplit(strsplit(mydata[fileLen-12]," ")[[1]][4], "\\[")[[1]][1])
 
 AccessibleFraction=as.numeric(strsplit(strsplit(mydata[fileLen-11]," ")[[1]][8],"%")[[1]][1])
 EstimatedQ20Depth=as.numeric(strsplit(mydata[fileLen-3],"\\s+|:")[[1]][8])

diff --git a/src/StatCollector.cpp b/src/StatCollector.cpp
@@ -2314,7 +2314,7 @@ int StatCollector::SummaryOutput(const string &outputPath) {
   ofstream fout(outputPath + ".Summary");
   fout << "Statistics : " << "Value\n";
   fout << "Expected Read Depth : " << (double)total_base / ref_genome_size
-       << " [" << total_base << "/" << ref_genome_size << "]\n";
+       << "[" << total_base << "/" << ref_genome_size << "]\n";
   /*auto AvgDepth =
       [&]()->double
   {	long long tmp(0); for (size_t i = 0; i != DepthDist.size(); ++i) tmp +=
@@ -2323,7 +2323,7 @@ int StatCollector::SummaryOutput(const string &outputPath) {
   fout << ((NumPositionCovered == 0)
                ? 0
                : NumBaseMapped / (double)total_region_size)
-       << " [" << NumBaseMapped << "/" << total_region_size <<"]\n";
+       << "[" << NumBaseMapped << "/" << total_region_size <<"]\n";
   fout << "Estimated Percentage of Accessible Genome Covered : "
        << (1. - (double)DepthDist[0] / total_region_size) * 100 << "%\n";
   // output for fraction figure