diff --git a/README.md b/README.md index cfb370a..3c67244 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# backmap.pl v0.1 +# backmap.pl v0.2 ## Description -__Automatic short read mapping and genome size estimate from coverage.__ +__Automatic read mapping and genome size estimate from coverage.__ -Automatic mapping of paired and unpaired reads to an assembly with `bwa mem`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode (>0). -The tools `bwa` and `samtools` need to be in your `$PATH`. The tools `qualimap`, `bedtools` and `Rscript` are optional but needed to create the mapping quality report, coverage histogram and plot of the coverage distribution respectively. +Automatic mapping of paired, unpaired, PacBio and Nanopore reads to an assembly with `bwa mem` or `minimap2`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode (>0). +The tools `samtools`, `bwa` and/or `minimap2` need to be in your `$PATH`. The tools `qualimap`, `multiqc`, `bedtools` and `Rscript` are optional but needed to create the mapping quality report, coverage histogram and plot of the coverage distribution respectively. ## Dependencies @@ -12,11 +12,17 @@ The tools `bwa` and `samtools` need to be in your `$PATH`. The tools `qualimap`, Mandatory: - [Number::FormatEng](https://metacpan.org/pod/Number::FormatEng) +- [samtools](https://github.com/samtools/samtools): `samtools` + +Short read mapping: - [bwa (mem)](https://github.com/lh3/bwa): `bwa` -- [samtools](https://github.com/samtools/samtools): `samtools` + +Long read mapping: +- [minimap2](https://github.com/lh3/minimap2): `minimap2` Optional: - [Qaulimap](http://qualimap.bioinfo.cipf.es/): `qualimap` +- [MultiQC](https://multiqc.info/): `multiqc` - [bedtools](https://bedtools.readthedocs.io/en/latest/) `bedtools` - [Rscript](https://www.r-project.org/) `Rscript` @@ -24,35 +30,41 @@ Optional: ``` backmap.pl [-a {-p , | -u } | - -b ] + -pb | -ont } | -b ] Mandatory: -a STR Assembly were reads should mapped to in fasta format - -p STR Two files with paired reads comma sperated - Can be specified multiple times - -u STR One file with unpaired reads - Can be specified multiple times + AND AT LEAST ONE OF + -p STR Two files with paired Illumina reads comma sperated + -u STR Fastq file with unpaired Illumina reads + -pb STR Fasta or fastq file with PacBio reads + -ont STR Fasta or fastq file with Nanopore reads OR -b STR Bam file to calculate coverage from Skips read mapping - Overrides -nh and -ne + Overrides -nh + Technologies will recognized correctly if filenames end with + .pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively. + Otherwise they are assumed to be from Illumina. + + All mandatory options except of -a can be specified multiple times Options: [default] -o STR Output directory [.] Will be created if not existing -t INT Number of parallel executed processes [1] Affects bwa mem, samtools sort, qualimap bamqc - -pre STR Prefix of output files [filename of -a or -b] - -sort Sort the bam file (-b) [off] + -pre STR Prefix of output files if -a is used [filename of -a] + -sort Sort the bam file(s) (-b) [off] -nq Do not run qualimap bamqc [off] -nh Do not create coverage histogram [off] Implies -ne -ne Do not estimate genome size [off] -kt Keep temporary bam files [off] -bo STR Options passed to bwa [-a -c 10000] - Pass options with -bo "" + -mo STR Options passed to minimap [PacBio: -H -x map-pb; ONT: -x map-ont] -qo STR Options passed to qualimap [none] - Pass options with -qo "" + Pass options with quotes e.g. -bo "" -v Print executed commands to STDERR [off] -dry-run Only print commands to STDERR instead of executing [off] @@ -61,4 +73,4 @@ Options: [default] ``` ## Citation -If you use this tool please cite the dependencies. +If you use this tool please cite the dependencies as well. diff --git a/backmap.pl b/backmap.pl index 869e900..e50a066 100755 --- a/backmap.pl +++ b/backmap.pl @@ -6,46 +6,49 @@ use IPC::Cmd qw[can_run run]; use Number::FormatEng qw(:all); -my $version = "0.1"; +my $version = "0.2"; sub print_help{ print STDOUT "\n"; print STDOUT "backmap.pl v$version\n"; print STDOUT "\n"; print STDOUT "Description:\n"; - print STDOUT "\tAutomatic mapping of paired and unpaired reads to an assembly, execution of\n\tqualimap bamqc and estimation of genome size from mapped nucleotides and\n\tpeak coverage.\n\tThe tools bwa, samtools, qualimap and bedtools need to be in your \$PATH.\n"; + print STDOUT "\tAutomatic mapping of paired, unpaired, PacBio and Nanopore reads to an\n\tassembly, execution of qualimap bamqc, multiqc and estimation of genome size\n\tfrom mapped nucleotides and peak coverage.\n\tThe tools bwa, minimap2, samtools, qualimap, multiqc, bedtools and Rscript need to be\n\tin your \$PATH.\n"; print STDOUT "\n"; print STDOUT "Usage:\n"; - print STDOUT "\tbackmap.pl [-a {-p , | -u } |\n"; - print STDOUT "\t -b ]\n"; + print STDOUT "\tbackmap.pl [-a {-p , | -u |\n"; + print STDOUT "\t -pb | -ont } | -b ]\n"; print STDOUT "\n"; print STDOUT "Mandatory:\n"; print STDOUT "\t-a STR\t\tAssembly were reads should mapped to in fasta format\n"; - print STDOUT "\t-p STR\t\tTwo files with paired reads comma sperated\n"; - print STDOUT "\t\t\tCan be specified multiple times\n"; - print STDOUT "\t-u STR\t\tOne file with unpaired reads\n"; - print STDOUT "\t\t\tCan be specified multiple times\n"; + print STDOUT "\tAND AT LEAST ONE OF\n"; + print STDOUT "\t-p STR\t\tTwo fastq files with paired Illumina reads comma sperated\n"; + print STDOUT "\t-u STR\t\tFastq file with unpaired Illumina reads\n"; + print STDOUT "\t-pb STR\t\tFasta or fastq file with PacBio reads\n"; + print STDOUT "\t-ont STR\tFasta or fastq file with Nanopore reads\n"; print STDOUT "\tOR\n"; print STDOUT "\t-b STR\t\tBam file to calculate coverage from\n"; print STDOUT "\t\t\tSkips read mapping\n"; print STDOUT "\t\t\tOverrides -nh\n"; + print STDOUT "\t\t\tTechnologies will recognized correctly if filenames end with\n\t\t\t.pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively.\n\t\t\tOtherwise they are assumed to be from Illumina.\n"; + print STDOUT "\tAll mandatory options except of -a can be specified multiple times\n"; print STDOUT "\n"; print STDOUT "Options: [default]\n"; print STDOUT "\t-o STR\t\tOutput directory [.]\n"; print STDOUT "\t\t\tWill be created if not existing\n"; print STDOUT "\t-t INT\t\tNumber of parallel executed processes [1]\n"; print STDOUT "\t\t\tAffects bwa mem, samtools sort, qualimap bamqc\n"; - print STDOUT "\t-pre STR\tPrefix of output files [filename of -a or -b]\n"; - print STDOUT "\t-sort\t\tSort the bam file (-b) [off]\n"; + print STDOUT "\t-pre STR\tPrefix of output files if -a is used [filename of -a]\n"; + print STDOUT "\t-sort\t\tSort the bam file(s) (-b) [off]\n"; print STDOUT "\t-nq\t\tDo not run qualimap bamqc [off]\n"; print STDOUT "\t-nh\t\tDo not create coverage histogram [off]\n"; print STDOUT "\t\t\tImplies -ne\n"; print STDOUT "\t-ne\t\tDo not estimate genome size [off]\n"; print STDOUT "\t-kt\t\tKeep temporary bam files [off]\n"; print STDOUT "\t-bo STR\t\tOptions passed to bwa [-a -c 10000]\n"; - print STDOUT "\t\t\tPass options with -bo \"\"\n"; + print STDOUT "\t-mo STR\t\tOptions passed to minimap [PacBio: -H -x map-pb; ONT: -x map-ont]\n"; print STDOUT "\t-qo STR\t\tOptions passed to qualimap [none]\n"; - print STDOUT "\t\t\tPass options with -qo \"\"\n"; + print STDOUT "\tPass options with quotes e.g. -bo \"\"\n"; print STDOUT "\t-v\t\tPrint executed commands to STDERR [off]\n"; print STDOUT "\t-dry-run\tOnly print commands to STDERR instead of executing [off]\n"; print STDOUT "\n"; @@ -64,25 +67,40 @@ sub exe_cmd{ } } +sub round_format_pref{ + my ($number) = @_; + $number = format_pref($number); + my $format_number = substr($number,0,-1); + $format_number = sprintf("%.2f", $format_number); + my $pref = substr($number,-1); + my $return = $format_number . $pref; + return $return; +} + my $out_dir = abs_path("./"); my $assembly_path = ""; my $assembly = ""; -#my $assembly_link = ""; my @paired = (); my @unpaired = (); +my @pb = (); +my @ont = (); my $threads = 1; my $prefix = ""; my $verbose = 0; my $bwa_opts = "-a -c 10000 "; +my $minimap_opts = ""; my $qm_opts = ""; my $create_histo_switch = 1; my $estimate_genome_size_switch = 1; my $run_bamqc_switch = 1; my $keep_tmp = 0; my $dry = 0; -my $bam = ""; +my @bam = (); my $sort_bam_switch = 0; my $cmd; +my $mkdir_cmd; +my $home = `echo \$HOME`; +chomp $home; my $input_error = 0; @@ -95,6 +113,10 @@ sub exe_cmd{ $out_dir = abs_path($ARGV[$i+1]); } if ($ARGV[$i] eq "-a"){ + if($assembly ne ""){ + print STDERR "ERROR\tSpecify -a just once\n"; + $input_error = 1; + } $assembly = (split /\//,$ARGV[$i+1])[-1]; $assembly_path = abs_path($ARGV[$i+1]); } @@ -104,8 +126,14 @@ sub exe_cmd{ if ($ARGV[$i] eq "-u"){ push(@unpaired,$ARGV[$i+1]); } + if ($ARGV[$i] eq "-pb"){ + push(@pb,$ARGV[$i+1]); + } + if ($ARGV[$i] eq "-ont"){ + push(@ont,$ARGV[$i+1]); + } if ($ARGV[$i] eq "-b"){ - $bam = abs_path($ARGV[$i+1]); + push(@bam,$ARGV[$i+1]); } if ($ARGV[$i] eq "-sort"){ $sort_bam_switch = 1; @@ -123,6 +151,10 @@ sub exe_cmd{ $bwa_opts = $ARGV[$i+1] . " "; #nonsense flags are skipped from bwa $ARGV[$i+1] = "\'$ARGV[$i+1]\'"; } + if ($ARGV[$i] eq "-mo"){ + $minimap_opts = $ARGV[$i+1] . " "; + $ARGV[$i+1] = "\'$ARGV[$i+1]\'"; + } if ($ARGV[$i] eq "-qo"){ $qm_opts = $ARGV[$i+1] . " "; #nonsense flags are skipped from qualimap $ARGV[$i+1] = "\'$ARGV[$i+1]\'"; @@ -155,20 +187,28 @@ sub exe_cmd{ print STDERR "CMD\t" . $0 . " " . join(" ",@ARGV) . "\n"; -if($assembly_path ne "" and $bam ne ""){ +if($assembly_path ne "" and scalar(@bam) > 0){ print STDERR "ERROR\tSpecify either -a or -b\n"; $input_error = 1; } -if($assembly_path eq "" and $bam eq ""){ +if($assembly_path eq "" and scalar(@bam) == 0){ print STDERR "ERROR\tSpecify either -a or -b\n"; $input_error = 1; } -if($assembly_path ne "" and $bam eq ""){ - if(not defined(can_run("bwa"))){ - print STDERR "ERROR\tbwa is not in your \$PATH\n"; - $input_error = 1; +if($assembly_path ne "" and scalar(@bam) == 0){ + if(scalar(@paired) > 0 or scalar(@unpaired) > 0){ + if(not defined(can_run("bwa"))){ + print STDERR "ERROR\tbwa is not in your \$PATH\n"; + $input_error = 1; + } + } + if(scalar(@pb) > 0 or scalar(@ont) > 0){ + if(not defined(can_run("minimap2"))){ + print STDERR "ERROR\tminimap2 is not in your \$PATH\n"; + $input_error = 1; + } } if(not defined(can_run("samtools"))){ print STDERR "ERROR\tsamtools is not in your \$PATH\n"; @@ -176,7 +216,7 @@ sub exe_cmd{ } } -if(not defined(can_run("qualimap"))){ +if(not defined(can_run("qualimap")) and $run_bamqc_switch == 1){ print STDERR "INFO\tqualimap is not in your \$PATH and will not be executed\n"; } @@ -186,7 +226,7 @@ sub exe_cmd{ print STDERR "WARNING\tGenome size estimation not possible\n"; $estimate_genome_size_switch = 0; } - if($bam ne ""){ + if(scalar(@bam) > 0){ print STDERR "ERROR\tGenome size estimation not possible\n"; $input_error = 1; } @@ -209,19 +249,12 @@ sub exe_cmd{ print STDERR "ERROR\tFile $assembly_path does not exist!\n"; $input_error = 1; } - if(scalar(@paired) == 0 and scalar(@unpaired) == 0){ + if(scalar(@paired) == 0 and scalar(@unpaired) == 0 and scalar(@pb) == 0 and scalar(@ont) == 0){ print STDERR "ERROR\tNo reads specified!\n"; $input_error = 1; } } -if($bam ne ""){ - if(not -f $bam){ - print STDERR "ERROR\tFile $bam does not exist!\n"; - $input_error = 1; - } -} - if($threads !~ m/^\d+$/ or $threads < 1){ print STDERR "ERROR\tThreads is no integer >= 1!\n"; $input_error = 1; @@ -232,26 +265,34 @@ sub exe_cmd{ exit 1; } +my $samtools_threads = $threads - 1; + if(not -d "$out_dir"){ print STDERR "INFO\tCreating output directory $out_dir\n"; - $cmd="mkdir -p $out_dir"; - exe_cmd($cmd,$verbose,$dry); + $mkdir_cmd="mkdir -p $out_dir"; +# exe_cmd($cmd,$verbose,$dry); } if($prefix eq ""){ if($assembly ne ""){ $prefix = $assembly; + print STDERR "INFO\tSetting prefix to $prefix\n"; } - if($bam ne ""){ - $prefix = (split /\//,$bam)[-1]; + else{ + print STDERR "INFO\tSetting prefix to name(s) of corresponding bam\n"; } - print STDERR "INFO\tSetting prefix to $prefix\n"; } -if($bam ne "" and defined(can_run("bedtools"))){ - if($create_histo_switch == 0 or $estimate_genome_size_switch == 0){ - $create_histo_switch = 1; - print STDERR "INFO\tCreating coverage histogram\n"; +if(scalar(@bam) > 0){ + if($prefix ne ""){ + print STDERR "INFO\tIgnoring -pre $prefix\n"; + $prefix = ""; + } + if(defined(can_run("bedtools"))){ + if($create_histo_switch == 0 or $estimate_genome_size_switch == 0){ + $create_histo_switch = 1; + print STDERR "INFO\tCreating coverage histogram\n"; + } } } @@ -260,6 +301,11 @@ sub exe_cmd{ if($assembly_path ne ""){ foreach(@paired){ my @pair = split(/,/,$_); + foreach(@pair){ + if($_ =~ m/^~/){ + $_ =~ s/^~/$home/; #~ is translated by bash into $HOME. This does not work if there is no space infront. That means if the second file starts with "~" it will not be recognized even though it exists + } + } if(scalar(@pair) != 2){ print STDERR "INFO\tNot a pair: $_ - skipping these file(s)\n"; } @@ -303,27 +349,104 @@ sub exe_cmd{ } } +my %pb_filter; + +if($assembly_path ne ""){ + foreach(@pb){ + if(not -f "$_"){ + print STDERR "INFO\tNo file $_ - skipping this file\n"; + } + else{ + if(exists($pb_filter{abs_path($_)})){ + print STDERR "INFO\tFile " . abs_path($_) . " already specified\n"; + } + else{ + $pb_filter{abs_path($_)} = 1; + } + } + } +} + +my %ont_filter; + +if($assembly_path ne ""){ + foreach(@ont){ + if(not -f "$_"){ + print STDERR "INFO\tNo file $_ - skipping this file\n"; + } + else{ + if(exists($ont_filter{abs_path($_)})){ + print STDERR "INFO\tFile " . abs_path($_) . " already specified\n"; + } + else{ + $ont_filter{abs_path($_)} = 1; + } + } + } +} + +my %bam_filter; + +if(scalar(@bam) > 0){ + foreach(@bam){ + if(not -f "$_"){ + print STDERR "INFO\tNo file $_ - skipping this file\n"; + } + else{ + if(exists($bam_filter{abs_path($_)})){ + print STDERR "INFO\tFile " . abs_path($_) . " already specified\n"; + } + else{ + $bam_filter{abs_path($_)} = 1; + } + } + } +} + if($assembly_path ne ""){ - if(scalar(keys(%paired_filter)) == 0 and scalar(keys(%unpaired_filter)) == 0){ + if(scalar(keys(%paired_filter)) == 0 and scalar(keys(%unpaired_filter)) == 0 and scalar(keys(%pb_filter)) == 0 and scalar(keys(%ont_filter)) == 0){ print STDERR "ERROR\tNo existing read files specified!\n"; exit 1; } } +else{ + if(scalar(keys(%bam_filter)) == 0){ + print STDERR "ERROR\tNo existing bam files specified!\n"; + exit 1; + } +} my $index_path = $out_dir . "/" . $assembly; if($assembly_path ne ""){ - if(not -f $assembly_path . ".amb" or not -f $assembly_path . ".ann" or not -f $assembly_path . ".bwt" or not -f $assembly_path . ".pac" or not -f $assembly_path . ".sa"){ - $cmd = "bwa index -p $index_path $assembly_path > $out_dir/$prefix\_bwa_index.log 2> $out_dir/$prefix\_bwa_index.err"; - } - else{ - print STDERR "INFO\tIndex files already existing\n"; - $index_path = $assembly_path; + if(scalar(keys(%paired_filter)) > 0 or scalar(keys(%unpaired_filter)) > 0){ + if(not -f $assembly_path . ".amb" or not -f $assembly_path . ".ann" or not -f $assembly_path . ".bwt" or not -f $assembly_path . ".pac" or not -f $assembly_path . ".sa"){ + $cmd = "bwa index -p $index_path $assembly_path > $out_dir/$prefix\_bwa_index.log 2> $out_dir/$prefix\_bwa_index.err"; + } + else{ + print STDERR "INFO\tIndex files already existing\n"; + $index_path = $assembly_path; + } } } -my $bwa_version = `bwa 2>&1 | head -3 | tail -1 | sed 's/^Version: //'`; -chomp $bwa_version; +my $bwa_version; +if(not defined(can_run("bwa"))){ + $bwa_version = "not detected"; +} +else{ + $bwa_version = `bwa 2>&1 | head -3 | tail -1 | sed 's/^Version: //'`; + chomp $bwa_version; +} + +my $minimap_version; +if(not defined(can_run("minimap2"))){ + $minimap_version = "not detected"; +} +else{ + $minimap_version = `minimap2 --version`; + chomp $minimap_version; +} my $samtools_version = `samtools --version | head -1 | sed 's/^samtools //'`; chomp $samtools_version; @@ -380,10 +503,13 @@ sub exe_cmd{ $estimate_genome_size_switch_word = "No"; } +print "\n"; +print "backmap.pl v$version\n"; print "\n"; print "Detected tools\n"; print "==============\n"; print "bwa: " . $bwa_version . "\n"; +print "minimap2: " . $minimap_version . "\n"; print "samtools: " . $samtools_version . "\n"; print "qualimap: " . $qualimap_version . "\n"; print "bedtools: " . $bedtools_version . "\n"; @@ -398,9 +524,14 @@ sub exe_cmd{ print join("\n ",keys(%paired_filter)) . "\n"; print "Unpaired reads: "; print join("\n ",keys(%unpaired_filter)) . "\n"; + print "PacBio reads: "; + print join("\n ",keys(%pb_filter)) . "\n"; + print "Nanopore reads: "; + print join("\n ",keys(%ont_filter)) . "\n"; } -if($bam ne ""){ - print "Bam file: " . $bam . "\n"; +if(scalar(keys(%bam_filter)) > 0){ + print "Bam files: "; + print join("\n ",keys(%bam_filter)) . "\n"; } print "Number of threads: " . $threads . "\n"; print "Outpufile prefix: " . $prefix . "\n"; @@ -408,6 +539,7 @@ sub exe_cmd{ print "Keep temporary files: " . $keep_tmp_word . "\n"; if($assembly_path ne ""){ print "bwa options: " . $bwa_opts . "\n"; + print "minimap2 options: " . $minimap_opts . "\n"; } print "Run qualimap bamqc: " . $run_bamqc_switch_word . "\n"; if($run_bamqc_switch == 1){ @@ -416,6 +548,10 @@ sub exe_cmd{ print "Create cov histo: " . $create_histo_switch_word . "\n"; print "Estimate genome size: " . $estimate_genome_size_switch_word . "\n"; +if(defined $mkdir_cmd){ + exe_cmd($mkdir_cmd,$verbose,$dry); +} + if(defined $cmd){ exe_cmd($cmd,$verbose,$dry); } @@ -441,134 +577,289 @@ sub exe_cmd{ push(@unpaired_bam,"$out_dir/$prefix.unpaired$unpaired_counter.bam"); } -my $merged_bam_file; +my $pb_counter = 0; +my @pb_bam = (); +foreach(keys(%pb_filter)){ + $pb_counter++; + $cmd = "minimap2 $minimap_opts-H -x map-pb -a -t $threads $assembly_path $_ 2> $out_dir/$prefix\_minimap_pb$pb_counter.err | samtools view -1 -b - > $out_dir/$prefix.pb$pb_counter.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@pb_bam,"$out_dir/$prefix.pb$pb_counter.bam"); +} + +my $ont_counter = 0; +my @ont_bam = (); +foreach(keys(%ont_filter)){ + $ont_counter++; + $cmd = "minimap2 $minimap_opts-x map-ont -a -t $threads $assembly_path $_ 2> $out_dir/$prefix\_minimap_ont$ont_counter.err| samtools view -1 -b - > $out_dir/$prefix.ont$ont_counter.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@ont_bam,"$out_dir/$prefix.ont$ont_counter.bam"); +} + +my @merged_bam_file = (); if($assembly_path ne ""){ - my $bam_count = scalar(@paired_bam) + scalar(@unpaired_bam); + my $ill_bam_count = scalar(@paired_bam) + scalar(@unpaired_bam); my $paired_bam_files = join(" ",@paired_bam); my $unpaired_bam_files = join(" ",@unpaired_bam); - $merged_bam_file = $out_dir . "/" . $prefix . ".bam"; + my $pb_bam_files = join(" ",@pb_bam); + my $ont_bam_files = join(" ",@ont_bam); - if($bam_count == 1){ - my $single_bam = join(" ",@paired_bam,@unpaired_bam); - $single_bam =~ s/^\s+//; - $single_bam =~ s/\s+$//; - $cmd = "ln -s $single_bam $merged_bam_file"; - exe_cmd($cmd,$verbose,$dry); + if($ill_bam_count > 0){ + if($ill_bam_count == 1){ + my $single_bam = join(" ",@paired_bam,@unpaired_bam); + $single_bam =~ s/^\s+//; + $single_bam =~ s/\s+$//; + $cmd = "ln -s $single_bam $out_dir/$prefix.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.bam"); + } + else{ + $cmd = "samtools merge -@ $samtools_threads $out_dir/$prefix.bam $paired_bam_files $unpaired_bam_files"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.bam"); + } } - else{ - $cmd = "samtools merge $merged_bam_file $paired_bam_files $unpaired_bam_files"; - exe_cmd($cmd,$verbose,$dry); + + if(scalar(@pb_bam) > 0){ + if(scalar(@pb_bam) == 1){ + my $single_bam = $pb_bam[0]; + $cmd = "ln -s $single_bam $out_dir/$prefix.pb.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.pb.bam"); + } + else{ + $cmd = "samtools merge -@ $samtools_threads $out_dir/$prefix.pb.bam $pb_bam_files"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.pb.bam"); + } + } + + if(scalar(@ont_bam) > 0){ + if(scalar(@ont_bam) == 1){ + my $single_bam = $ont_bam[0]; + $cmd = "ln -s $single_bam $out_dir/$prefix.ont.bam"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.ont.bam"); + } + else{ + $cmd = "samtools merge -@ $samtools_threads $out_dir/$prefix.ont.bam $ont_bam_files"; + exe_cmd($cmd,$verbose,$dry); + push(@merged_bam_file, "$out_dir/$prefix.ont.bam"); + } } + } -my $sorted_bam_file; -if($bam ne ""){ +my @sorted_bams; +if(scalar(keys(%bam_filter)) > 0){ if($sort_bam_switch == 0){ - $sorted_bam_file = $bam; + @sorted_bams = keys(%bam_filter); } if($sort_bam_switch == 1){ - $merged_bam_file = $bam; - $sorted_bam_file = $out_dir . "/" . $prefix . ".sort.bam"; + @merged_bam_file = keys(%bam_filter); } } -if($assembly_path ne ""){ - $sorted_bam_file = $out_dir . "/" . $prefix . ".sort.bam"; -} - if($assembly_path ne "" or $sort_bam_switch == 1){ - $cmd = "samtools sort -l 9 -@ $threads -T $prefix -o $sorted_bam_file $merged_bam_file"; - exe_cmd($cmd,$verbose,$dry); + foreach(@merged_bam_file){ + my $sorted_bam_file = $_; + $sorted_bam_file =~ s/\.bam$/\.sort\.bam/; + $cmd = "samtools sort -l 9 -@ $samtools_threads -T $out_dir/$prefix -o $sorted_bam_file $_"; + exe_cmd($cmd,$verbose,$dry); + push(@sorted_bams,$sorted_bam_file); + } if($keep_tmp == 0){ - my $tmp_bams = join(" ",@paired_bam,@unpaired_bam); - $cmd = "rm $merged_bam_file $tmp_bams"; + my $tmp_bams = join(" ",@paired_bam,@unpaired_bam,@pb_bam,@ont_bam,@merged_bam_file); + $cmd = "rm $tmp_bams"; exe_cmd($cmd,$verbose,$dry); } } if($run_bamqc_switch == 1){ if(defined(can_run("qualimap"))){ - $cmd = "qualimap bamqc $qm_opts-bam $sorted_bam_file -nt $threads > $out_dir/$prefix\_bamqc.log 2> $out_dir/$prefix\_bamqc.err"; - exe_cmd($cmd,$verbose,$dry); + foreach(@sorted_bams){ + my $bamqc_out = $_; + $bamqc_out = (split(/\//,$bamqc_out))[-1]; + $bamqc_out =~ s/\.bam$/_stats/; + $cmd = "qualimap bamqc $qm_opts-bam $_ -nt $threads -outdir $out_dir/$bamqc_out > $out_dir/$bamqc_out\_bamqc.log 2> $out_dir/$bamqc_out\_bamqc.err"; + exe_cmd($cmd,$verbose,$dry); + } + if(defined(can_run("multiqc")) and scalar(@sorted_bams) > 1){ + $cmd = "multiqc -s -m qualimap -o $out_dir $out_dir > $out_dir/multiqc.log 2> $out_dir/multiqc.err"; + exe_cmd($cmd,$verbose,$dry); + } } } -my $cov_hist_file; -my $peak_cov; +my %cov_files; +my %peak_cov; +my %n0_all; +my $global_ymax = 0; if($create_histo_switch == 1){ - $cov_hist_file = "$out_dir/$prefix.cov-hist"; - $cmd = "samtools view -b -h -F 256 $sorted_bam_file | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file"; - exe_cmd($cmd,$verbose,$dry); - - if($dry == 0){ - $peak_cov = `sort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1`; - chomp $peak_cov; - my $n0 = `awk \'\$1==0{print \$2}\' $cov_hist_file`; - chomp $n0; - my $ymax = `sort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$2}\' | head -1`; - chomp $ymax; - - open(R,'>',$cov_hist_file . ".plot.r") or die "ERROR\tCould not open file " . $cov_hist_file . "plot.r\n"; + foreach(@sorted_bams){ + + my $tech = "Illumina"; + if($_ =~ m/\.pb\.sort\.bam$/){ + $tech = "PacBio"; + } + if($_ =~ m/\.ont\.sort\.bam$/){ + $tech = "Nanopore"; + } - print R "x=read.table(\"$cov_hist_file\")\n"; - print R "pdf(\"$cov_hist_file.pdf\")\n"; - if($n0 < $ymax){ - print R "plot(x[,1],x[,2],log=\"x\",type=\"l\",xlab=\"Coverage\",ylab=\"Count\")\n"; + my $filename = (split(/\//,$_))[-1]; + my $cov_hist_file = "$out_dir/$filename.cov-hist"; + $cmd = "samtools view -b -h -F 256 $_ | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file"; + exe_cmd($cmd,$verbose,$dry); + + $cov_files{$tech} = $cov_hist_file; + + if($dry == 0){ + my $peak = `sort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1`; + chomp $peak; + $peak_cov{$tech} = $peak; + my $n0 = `awk \'\$1==0{print \$2}\' $cov_hist_file`; + chomp $n0; + $n0_all{$tech} = $n0; + my $ymax = `sort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$2}\' | head -1`; + chomp $ymax; + if($ymax > $global_ymax){ + $global_ymax = $ymax; + } + + open(R,'>',$cov_hist_file . ".plot.r") or die "ERROR\tCould not open file " . $cov_hist_file . "plot.r\n"; + + print R "x=read.table(\"$cov_hist_file\")\n"; + print R "pdf(\"$cov_hist_file.pdf\")\n"; + if($n0 < $ymax){ + print R "plot(x[,1],x[,2],log=\"x\",type=\"l\",xlab=\"Coverage\",ylab=\"Count\",main=\"$assembly\\n$tech\")\n"; + } + else{ + print R "plot(x[,1],x[,2],ylim=c(0,$ymax),log=\"x\",type=\"l\",xlab=\"Coverage\",ylab=\"Count\",main=\"$assembly\\n$tech\")\n"; + } + print R "text(2.5,$ymax,\"N(0)=$n0\")\n"; + print R "dev.off()\n"; + + close R; } else{ - print R "plot(x[,1],x[,2],ylim=c(0,$ymax),log=\"x\",type=\"l\",xlab=\"Coverage\",ylab=\"Count\")\n"; + print STDERR "CMD\tsort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1\n"; + print STDERR "CMD\tawk \'\$1==0{print \$2}\' $cov_hist_file\n"; + print STDERR "I would create $cov_hist_file.plot.r\n"; } - print R "text(2.5,$ymax,\"N(0)=$n0\")\n"; - print R "dev.off()"; - close R; - } - else{ - print STDERR "CMD\tsort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1\n"; - print STDERR "CMD\tawk \'\$1==0{print \$2}\' $cov_hist_file\n"; - print STDERR "I would create $cov_hist_file.plot.r\n"; + if(defined(can_run("Rscript"))){ + $cmd = "Rscript $cov_hist_file.plot.r > /dev/null 2> /dev/null"; + exe_cmd($cmd,$verbose,$dry); + } } - if(defined(can_run("Rscript"))){ - $cmd = "Rscript $cov_hist_file.plot.r > /dev/null 2> /dev/null"; - exe_cmd($cmd,$verbose,$dry); + if(scalar(@sorted_bams) > 1){ + if($dry == 0){ + my @techs = ("Illumina","PacBio","Nanopore"); + + open(RALL,'>',"$out_dir/plot.all.r") or die "ERROR\tCould not open file $out_dir/plot.all.r\n"; + + for(my $i = 0; $i < scalar(@techs); $i++){ + if(exists($cov_files{$techs[$i]})){ + print RALL "$techs[$i]=read.table(\"$cov_files{$techs[$i]}\")\n"; + } + } + print RALL "pdf(\"$out_dir/plot.all.pdf\")\n"; + my @legend = (); + my @lty = (); + my @col = (); + for(my $i = 0; $i < scalar(@techs); $i++){ + if(exists $cov_files{$techs[$i]}){ + push(@legend,"\"$techs[$i] N(0)=$n0_all{$techs[$i]}\""); + push(@lty,"1"); + if($i == 0 and exists($cov_files{$techs[$i]})){ + print RALL "plot($techs[$i]\[,1],$techs[$i]\[,2],log=\"x\",type=\"l\",xlab=\"Coverage\",ylab=\"Count\",main=\"$assembly\",ylim=c(0,$global_ymax))\n"; + push(@col,"\"black\""); + } + if($i == 1 and exists($cov_files{$techs[$i]})){ + print RALL "lines($techs[$i]\[,1],$techs[$i]\[,2],type=\"l\",col=\"blue\")\n"; + push(@col,"\"blue\""); + } + if($i == 2 and exists($cov_files{$techs[$i]})){ + print RALL "lines($techs[$i]\[,1],$techs[$i]\[,2],type=\"l\",col=\"red\")\n"; + push(@col,"\"red\""); + } + } + } + print RALL "legend(\"topright\",legend=c(" . join(",",@legend) . "),lty=c(" . join(",",@lty) . "),col=c(" . join(",",@col) . "))\n"; + print RALL "dev.off()\n"; + + close RALL; + } + else{ + print STDERR "I would create $out_dir/plot.all.r\n"; + } + + if(defined(can_run("Rscript"))){ + $cmd = "Rscript $out_dir/plot.all.r > /dev/null 2> /dev/null"; + exe_cmd($cmd,$verbose,$dry); + } } + } if($estimate_genome_size_switch == 1){ - if($dry == 0){ - $peak_cov = `sort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1`; - chomp $peak_cov; - } - else{ - print STDERR "CMD\tsort -rgk2 $cov_hist_file | awk \'\$1!=0{print \$1}\' | head -1\n"; - } - - $cmd = "samtools stats $sorted_bam_file > $out_dir/$prefix.stats 2> $out_dir/$prefix.stats.err"; - exe_cmd($cmd,$verbose,$dry); - - my $total_nucs; - if($dry == 0){ - $total_nucs = `grep "bases mapped (cigar):" $out_dir/$prefix.stats | awk -F'\\t' '{print \$3}'`; - } - else{ - print STDERR "CMD\tgrep \"bases mapped (cigar):\" $out_dir/$prefix.stats | awk -F\'\\t\' \'{print \$3}\'\n"; - } - if($dry == 0){ print "\n"; print "Output\n"; - print "==================\n"; - print "Mapped nucleotides: " . format_pref($total_nucs) . "b\n"; - print "Peak coverage: " . $peak_cov . "\n"; + print "======\n"; + } + + foreach(@sorted_bams){ + if($dry == 1){ + print STDERR "CMD\tsort -rgk2 $_.cov-hist | awk \'\$1!=0{print \$1}\' | head -1\n"; + } - my $genome_size_estimate = $total_nucs / $peak_cov; - print "Genome size estimate: " . format_pref($genome_size_estimate) . "b\n"; + $cmd = "samtools stats $_ > $_.stats 2> $_.stats.err"; + exe_cmd($cmd,$verbose,$dry); + + if($dry == 0){ + my $assembly_length = 0; + my $assemly_perc = 0; + if($assembly ne ""){ + open(IN,'<',"$assembly_path") or die "ERROR\tCould not open file " . $assembly_path . "\n"; + while(my $line = ){ + chomp $line; + if($line !~ m/^>/){ + $assembly_length = $assembly_length + length($line); + } + } + } + my $total_nucs = `grep "bases mapped (cigar):" $_.stats | awk -F'\\t' '{print \$3}'`; + chomp $total_nucs; + + my $tech = "Illumina"; + if($_ =~ m/\.pb\.sort\.bam$/){ + $tech = "PacBio"; + } + if($_ =~ m/\.ont\.sort\.bam$/){ + $tech = "Nanopore"; + } + + my $genome_size_estimate = $total_nucs / $peak_cov{$tech}; + + print $tech . " (" . $_ . ")\n"; + print "Mapped nucleotides: " . round_format_pref($total_nucs) . "b\n"; + print "Peak coverage: " . $peak_cov{$tech} . "\n"; + print "Genome size estimate: " . round_format_pref($genome_size_estimate) . "b\n"; + if($assembly_length > 0){ + $assemly_perc = sprintf("%.2f", ($assembly_length / $genome_size_estimate) * 100); + print "Assembly length: " . round_format_pref($assembly_length) . "b ($assemly_perc% of estimate)\n"; + } + } + else{ + print STDERR "CMD\tgrep \"bases mapped (cigar):\" $_.stats | awk -F\'\\t\' \'{print \$3}\'\n"; + } } + } exit;