diff --git a/README.md b/README.md index a478964..b9684bc 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# backmap.pl v0.4 +# backmap.pl v0.5 ## Description __Automatic read mapping and genome size estimation from coverage.__ -Automatic mapping of paired, unpaired, PacBio and Nanopore reads to an assembly with `bwa mem` or `minimap2`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode of the coverage distribution (>0). This method was first pulished in Schell et al. (2017) and is slightly refined in this script. +Automatic mapping of paired, unpaired, PacBio and Nanopore reads to an assembly with `bwa mem` or `minimap2`, execution of `qualimap bamqc` and estimation of genome size from mapped nucleotides divided by mode of the coverage distribution (>0). This method was first pulished in Schell et al. (2017). To show high accuracy and reliability of this method throughout the tree of life, Pfenninger et al. (2021) published a study comparing different estimators. Currently, the estimator Nbm/m (number of back-mapped bases divided by the modal value of the sequencing depth distribution) is implemented in this script only. The tools `samtools`, `bwa` and/or `minimap2` need to be in your `$PATH`. The tools `qualimap`, `multiqc`, `bedtools` and `Rscript` are optional but needed to create the mapping quality report, coverage histogram as well as genome size estimation and to plot of the coverage distribution respectively. ## Dependencies @@ -46,8 +46,9 @@ Mandatory: Skips read mapping Overrides -nh Technologies will recognized correctly if filenames end with - .pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively. - Otherwise they are assumed to be from Illumina. + .pb(.sort).bam, .hifi(.sort).bam or .ont(.sort).bam for PacBio CLR, + PacBio HiFi and Nanopore respectively. Otherwise they are assumed to + be from Illumina. All mandatory options except of -a can be specified multiple times @@ -55,7 +56,7 @@ Options: [default] -o STR Output directory [.] Will be created if not existing -t INT Number of parallel executed processes [1] - Affects bwa mem, samtools sort, qualimap bamqc + Affects bwa mem, samtools sort/index/view/stats, qualimap bamqc -pre STR Prefix of output files if -a is used [filename of -a] -sort Sort the bam file(s) (-b) [off] -nq Do not run qualimap bamqc [off] @@ -76,6 +77,8 @@ Options: [default] ``` ## Citation +Pfenninger M, Schönenbeck P & Schell T (2021). ModEst: Accurate estimation of genome size from next generation sequencing data. _Molecular ecology resources_, 00, 1–11. + Schell T, Feldmeyer B, Schmidt H, Greshake B, Tills O et al. (2017). An Annotated Draft Genome for _Radix auricularia_ (Gastropoda, Mollusca). _Genome Biology and Evolution_, 9(3):585–592, __If you use this tool please cite the dependencies as well:__ diff --git a/backmap.pl b/backmap.pl index bb237f6..8bdd050 100755 --- a/backmap.pl +++ b/backmap.pl @@ -7,7 +7,7 @@ use Number::FormatEng qw(:all); use Parallel::Loops; -my $version = "0.4"; +my $version = "0.5"; sub print_help{ print STDOUT "\n"; @@ -32,14 +32,15 @@ sub print_help{ print STDOUT "\t-b STR\t\tBam file to calculate coverage from\n"; print STDOUT "\t\t\tSkips read mapping\n"; print STDOUT "\t\t\tOverrides -nh\n"; - print STDOUT "\t\t\tTechnologies will recognized correctly if filenames end with\n\t\t\t.pb(.sort).bam or .ont(.sort).bam for PacBio and Nanopore respectively.\n\t\t\tOtherwise they are assumed to be from Illumina.\n"; + print STDOUT "\t\t\tTechnologies will recognized correctly if filenames end with\n\t\t\t.pb(.sort).bam, .hifi(.sort).bam or .ont(.sort).bam for PacBio CLR,\n\t\t\tPacBio HiFi and Nanopore respectively. Otherwise they are assumed to\n\t\t\tbe from Illumina.\n"; + print STDOUT "\n"; print STDOUT "\tAll mandatory options except of -a can be specified multiple times\n"; print STDOUT "\n"; print STDOUT "Options: [default]\n"; print STDOUT "\t-o STR\t\tOutput directory [.]\n"; print STDOUT "\t\t\tWill be created if not existing\n"; print STDOUT "\t-t INT\t\tNumber of parallel executed processes [1]\n"; - print STDOUT "\t\t\tAffects bwa mem, samtools sort, qualimap bamqc\n"; + print STDOUT "\t\t\tAffects bwa mem, samtools sort/index/view/stats, qualimap bamqc\n"; print STDOUT "\t-pre STR\tPrefix of output files if -a is used [filename of -a]\n"; print STDOUT "\t-sort\t\tSort the bam file(s) (-b) [off]\n"; print STDOUT "\t-nq\t\tDo not run qualimap bamqc [off]\n"; @@ -210,7 +211,7 @@ sub round_format_pref{ $input_error = 1; } } - if(scalar(@pb) > 0 or scalar(@ont) > 0){ + if(scalar(@pb) > 0 or scalar(@hifi) > 0 or scalar(@ont) > 0){ if(not defined(can_run("minimap2"))){ print STDERR "ERROR\tminimap2 is not in your \$PATH\n"; $input_error = 1; @@ -738,6 +739,11 @@ sub round_format_pref{ } } +foreach(@sorted_bams){ + $cmd = "samtools index -@ $samtools_threads $_"; + exe_cmd($cmd,$verbose,$dry); +} + if($run_bamqc_switch == 1){ if(defined(can_run("qualimap"))){ foreach(@sorted_bams){ @@ -766,6 +772,13 @@ sub round_format_pref{ if($create_histo_switch == 1){ + foreach(@sorted_bams){ + my $filename = (split(/\//,$_))[-1]; + my $cov_hist_file = "$out_dir/$filename.cov-hist"; + $cmd = "samtools view -@ $samtools_threads -b -h -F 256 $_ | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file"; + exe_cmd($cmd,$verbose,$dry); + } + my $multiple_histos = Parallel::Loops->new($maxProcs); $multiple_histos->share(\%cov_files); $multiple_histos->share(\%peak_cov); @@ -787,8 +800,6 @@ sub round_format_pref{ my $filename = (split(/\//,$_))[-1]; my $cov_hist_file = "$out_dir/$filename.cov-hist"; - $cmd = "samtools view -b -h -F 256 $_ | bedtools genomecov -ibam stdin -d | awk \'{print \$3}\' | sort -g | uniq -c | awk '{print \$2\"\\t\"\$1}' > $cov_hist_file"; - exe_cmd($cmd,$verbose,$dry); $cov_files{$tech} = $cov_hist_file; @@ -899,7 +910,12 @@ sub round_format_pref{ if($estimate_genome_size_switch == 1){ my %results; - + + foreach(@sorted_bams){ + $cmd = "samtools stats -@ $samtools_threads $_ > $_.stats 2> $_.stats.err"; + exe_cmd($cmd,$verbose,$dry); + } + my $multiple_genome_size = Parallel::Loops->new($maxProcs); $multiple_genome_size->share(\%results); @@ -908,8 +924,8 @@ sub round_format_pref{ print STDERR "CMD\tsort -rgk2 $_.cov-hist | awk \'\$1!=0{print \$1}\' | head -1\n"; } - $cmd = "samtools stats $_ > $_.stats 2> $_.stats.err"; - exe_cmd($cmd,$verbose,$dry); +# $cmd = "samtools stats $_ > $_.stats 2> $_.stats.err"; +# exe_cmd($cmd,$verbose,$dry); if($dry == 0){