From 2e8d49cd31e3f0c1f1f65f333ca7682ae5bcf7c4 Mon Sep 17 00:00:00 2001 From: Ben Woodcroft Date: Fri, 28 Aug 2020 16:56:29 +1000 Subject: [PATCH] v0.5.0 --- Cargo.toml | 13 +- README.md | 34 +-- docs/coverm-cluster.html | 325 +++++++++++++++++++++ docs/coverm-contig.html | 451 ++++++++++++++++++++++++++++ docs/coverm-filter.html | 254 ++++++++++++++++ docs/coverm-genome.html | 614 +++++++++++++++++++++++++++++++++++++++ docs/coverm-make.html | 292 +++++++++++++++++++ prelude | 10 + release.sh | 26 +- src/bin/coverm.rs | 6 +- src/cli.rs | 428 ++++++++++++++++----------- src/lib.rs | 8 +- 12 files changed, 2260 insertions(+), 201 deletions(-) create mode 100644 docs/coverm-cluster.html create mode 100644 docs/coverm-contig.html create mode 100644 docs/coverm-filter.html create mode 100644 docs/coverm-genome.html create mode 100644 docs/coverm-make.html create mode 100644 prelude diff --git a/Cargo.toml b/Cargo.toml index 9428d77..775909f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "coverm" -version = "0.4.0" +version = "0.5.0" authors = ["Ben Woodcroft "] license = "GPL-3.0" description = "Read coverage calculator for metagenomics" @@ -19,17 +19,18 @@ rust-htslib = { version="0.32.*", default-features = false } clap = "2.*" log = "0.4.*" env_logger = "0.7.*" -nix = "0.17.*" +nix = "0.18.*" tempdir = "0.3.*" tempfile = "3.*" ansi_term = "0.12" lazy_static = "1.4.0" rand = "0.7.*" serde = "1.0" -version-compare = "0.0.10" -bird_tool_utils = {git = "https://github.com/wwood/bird_tool_utils", rev = "3b16f11ed421ae74c1a140f3894da02584bcb5b5"} -galah = {git = "https://github.com/wwood/galah", rev = "e55f48cf4c9f23a1f362df6f05be518895f2b480"} -man = {git = "https://github.com/wwood/man", rev = "6aed86ec55ab6c867b391658586a597770c6dcb1"} +version-compare = "0.0.11" +bird_tool_utils = "0.2.0" +galah = "0.2.0" +bird_tool_utils-man = "0.4.0" +roff = "0.1.*" [dev-dependencies] assert_cli = "0.6.*" diff --git a/README.md b/README.md index 
92dd454..40d0a5b 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,9 @@ CoverM aims to be a configurable, easy to use and fast DNA read coverage and relative abundance calculator focused on metagenomics applications. -CoverM calculates coverage of genomes/MAGs (`coverm genome`) or individual -contigs (`coverm contig`). Calculating coverage by read mapping, its input can -either be BAM files sorted by reference, or raw reads and reference FASTA -sequences. +CoverM calculates coverage of genomes/MAGs `coverm genome` ([help](https://wwood.github.io/coverm/coverm-genome.html)) or individual +contigs `coverm contig` ([help](https://wwood.github.io/coverm/coverm-contig.html)). Calculating coverage by read mapping, its input can +either be BAM files sorted by reference, or raw reads and reference genomes in various formats. ## Installation @@ -97,27 +96,16 @@ coverm shell-completion --shell bash --output-file /dev/stdout >>~/.bash_complet In both cases, to take effect, the terminal will likely need to be restarted. To test, type `coverm gen` and it should complete after pressing the TAB key. ## Usage -``` -Mapping coverage analysis for metagenomics - -Usage: coverm ... - -Main subcommands: - contig Calculate coverage of contigs - genome Calculate coverage of genomes -Less used utility subcommands: - make Generate BAM files through alignment - filter Remove (or only keep) alignments with insufficient identity - cluster Dereplicate and cluster genomes - shell-completion - Generate shell completion scripts - -Other options: - -V, --version Print version information -``` +CoverM operates in several modes. 
Detailed usage information is given at the links below, or alternatively by using the `-h` or `--full-help` flags for each mode: +* [genome](https://wwood.github.com/coverm/coverm-genome.html) - Calculate coverage of genomes +* [contig](https://wwood.github.com/coverm/coverm-contig.html) - Calculate coverage of contigs -For more detailed usage see `coverm -h` or `coverm --full-help`. +There are several utility modes as well: +* [make](https://wwood.github.com/coverm/coverm-make.html) - Generate BAM files through alignment +* [filter](https://wwood.github.com/coverm/coverm-filter.html) - Remove (or only keep) alignments with insufficient identity +* [cluster](https://wwood.github.com/coverm/coverm-cluster.html) - Dereplicate and cluster genomes +* shell-completion - Generate shell completion scripts ## Calculation methods diff --git a/docs/coverm-cluster.html b/docs/coverm-cluster.html new file mode 100644 index 0000000..698e81b --- /dev/null +++ b/docs/coverm-cluster.html @@ -0,0 +1,325 @@ + + + + + + + + + + + + + + + + +coverm cluster usage + + + + + + + + + + + + + + + + + + + + +
+
+
+
+coverm cluster usage +Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology +2020-08-28 (coverm 0.5.0) +
+
+
+ + +
+
+ + +
+
+
+

NAME

+

coverm cluster - Cluster genome FASTA files by average nucleotide identity (version 0.5.0)

+
+
+

SYNOPSIS

+

coverm cluster <GENOME_INPUTS> <OUTPUT_ARGUMENTS>

+
+
+

DESCRIPTION

+

This cluster mode dereplicates genomes, choosing a subset of the input genomes as representatives. Required inputs are (1) a genome definition, and (2) an output format definition.

+

The source code for this program can be found at https://github.com/wwood/galah or https://github.com/wwood/coverm

+
+
+

GENOME INPUT

+
+
-f, --genome-fasta-files PATH ..
+

Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.

+
+
+ +
+
-d, --genome-fasta-directory PATH
+

Directory containing FASTA files of each genome.

+
+
+ +
+
-x, --genome-fasta-extension EXT
+

File extension of genomes in the directory specified with -d/--genome-fasta-directory. [default: fna]

+
+
+ +
+
--genome-fasta-list PATH
+

File containing FASTA file paths, one per line.

+
+
+
+
+

FILTERING PARAMETERS

+
+
--checkm-tab-table PATH
+

CheckM tab table (i.e. the output of checkm .. --tab_table -f PATH ..) for defining genome quality, which is used both for filtering and to rank genomes during clustering.

+
+
+ +
+
--genome-info PATH
+

dRep style genome info table for defining quality. Used like --checkm-tab-table.

+
+
+ +
+
--min-completeness FLOAT
+

Ignore genomes with less completeness than this percentage. [default: not set]

+
+
+ +
+
--max-contamination FLOAT
+

Ignore genomes with more contamination than this percentage. [default: not set]

+
+
+
+
+

CLUSTERING PARAMETERS

+
+
--ani FLOAT
+

Overall ANI level to dereplicate at with FastANI. [default: 99]

+
+
+ +
+
--min-aligned-fraction FLOAT
+

Min aligned fraction of two genomes for clustering. [default: 50]

+
+
+ +
+
--fragment-length FLOAT
+

Length of fragment used in FastANI calculation (i.e. --fragLen). [default: 3000]

+
+
+ +

--quality-formula FORMULA

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Scoring function for genome quality [default: Parks2020_reduced]. One of:
formuladescription
Parks2020_reduced(default) A quality formula described in Parks et al. 2020 https://doi.org/10.1038/s41587-020-0501-8 (Supplementary Table 19) but only including those scoring criteria that can be calculated from the sequence without homology searching: completeness-5*contamination-5*num_contigs/100-5*num_ambiguous_bases/100000
completeness-4contaminationcompleteness-4*contamination
completeness-5contaminationcompleteness-5*contamination
dRepcompleteness-5*contamination+contamination*(strain_heterogeneity/100)+0.5*log10(N50)
+
+
--precluster-ani FLOAT
+

Require at least this dashing-derived ANI for preclustering and to avoid FastANI on distant lineages within preclusters. [default: 95]

+
+
+ +
+
--precluster-method NAME
+

Method of calculating rough ANI for dereplication. 'dashing' for HyperLogLog, 'finch' for finch MinHash. [default: dashing]

+
+
+
+
+

OUTPUT

+
+
--output-cluster-definition PATH
+

Output a file of representative<TAB>member lines.

+
+
+ +
+
--output-representative-fasta-directory PATH
+

Symlink representative genomes into this directory.

+
+
+ +
+
--output-representative-fasta-directory-copy PATH
+

Copy representative genomes into this directory.

+
+
+ +
+
--output-representative-list PATH
+

Print newline separated list of paths to representatives into this file.

+
+
+
+
+

GENERAL PARAMETERS

+
+
-t, --threads INT
+

Number of threads. [default: 1]

+
+
+ +
+
-v, --verbose
+

Print extra debugging information

+
+
+ +
+
-q, --quiet
+

Unless there is an error, do not print log messages

+
+
+ +
+
-h, --help
+

Output a short usage message.

+
+
+ +
+
--full-help
+

Output a full help message and display in 'man'.

+
+
+ +
+
--full-help-roff
+

Output a full help message in raw ROFF format for conversion to other formats.

+
+
+
+
+

EXIT STATUS

+
+
0
+

Successful program execution.

+
+
+ +
+
1
+

Unsuccessful program execution.

+
+
+ +
+
101
+

The program panicked.

+
+
+
+
+

AUTHOR

+
+
Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology <benjwoodcroft near gmail.com>
+
+
+
+
+
+
+
+ + + + + + + + diff --git a/docs/coverm-contig.html b/docs/coverm-contig.html new file mode 100644 index 0000000..b05706a --- /dev/null +++ b/docs/coverm-contig.html @@ -0,0 +1,451 @@ + + + + + + + + + + + + + + + + +coverm contig usage + + + + + + + + + + + + + + + + + + + + +
+
+
+
+coverm contig usage +Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology +2020-08-28 (coverm 0.5.0) +
+
+
+ + +
+
+ + +
+
+
+

NAME

+

coverm contig - Calculate read coverage per-contig (version 0.5.0)

+
+
+

SYNOPSIS

+

coverm contig <MAPPING_INPUT> ..

+
+
+

DESCRIPTION

+

coverm contig calculates the coverage of a set of reads on a set of contigs.

+

This process can be undertaken in several ways, for instance by specifying BAM files or raw reads as input, using different mapping programs, thresholding read alignments, using different methods of calculating coverage and printing the calculated coverage in various formats.

+

The source code for CoverM is available at https://github.com/wwood/CoverM

+
+
+

READ MAPPING PARAMETERS

+
+
-1 PATH ..
+

Forward FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-2 PATH ..
+

Reverse FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-c, --coupled PATH ..
+

One or more pairs of forward and reverse possibly gzipped FASTA/Q files for mapping in order <sample1_R1.fq.gz> <sample1_R2.fq.gz> <sample2_R1.fq.gz> <sample2_R2.fq.gz> ..

+
+
+ +
+
--interleaved PATH ..
+

Interleaved FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
--single PATH ..
+

Unpaired FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-b, --bam-files PATH
+

Path to BAM file(s). These must be reference sorted (e.g. with samtools sort) unless --sharded is specified, in which case they must be read name sorted (e.g. with samtools sort -n). When specified, no read mapping algorithm is undertaken.

+
+
+
+
+

REFERENCE

+
+
-r, --reference PATH
+

FASTA file of contigs e.g. concatenated genomes or metagenome assembly, or minimap2 index (with --minimap2-reference-is-index), or BWA index stem (with -p bwa-mem). If multiple reference FASTA files are provided and --sharded is specified, then reads will be mapped to references separately as sharded BAMs. [required unless -b/--bam-files is specified]

+
+
+
+
+

SHARDING

+
+
--sharded
+

If -b/--bam-files was used: Input BAM files are read-sorted alignments of a set of reads mapped to multiple reference contig sets. Choose the best hit for each read pair. Otherwise if mapping was carried out: Map reads to each reference, choosing the best hit for each pair. [default: not set]

+
+
+
+
+

MAPPING ALGORITHM OPTIONS

+

-p, --mapper NAME

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Underlying mapping software used [default: minimap2-sr]. One of:
namedescription
minimap2-srminimap2 with '-x sr' option
bwa-membwa mem using default parameters
minimap2-ontminimap2 with '-x map-ont' option
minimap2-pbminimap2 with '-x map-pb' option
minimap2-no-presetminimap2 with no '-x' option
+
+
--minimap2-params PARAMS
+

Extra parameters to provide to minimap2, both indexing command (if used) and for mapping. Note that usage of this parameter has security implications if untrusted input is specified. '-a' is always specified to minimap2. [default: none]

+
+
+ +
+
--minimap2-reference-is-index
+

Treat reference as a minimap2 database, not as a FASTA file. [default: not set]

+
+
+ +
+
--bwa-params PARAMS
+

Extra parameters to provide to BWA. Note that usage of this parameter has security implications if untrusted input is specified. [default: none]

+
+
+
+
+

ALIGNMENT THRESHOLDING

+
+
--min-read-aligned-length INT
+

Exclude reads with smaller numbers of aligned bases. [default: 0]

+
+
+ +
+
--min-read-percent-identity FLOAT
+

Exclude reads by overall percent identity e.g. 0.95 for 95%. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. [default: 0.0]

+
+
+ +
+
--min-read-aligned-length-pair INT
+

Exclude pairs with smaller numbers of aligned bases. Implies --proper-pairs-only. [default: 0]

+
+
+ +
+
--min-read-percent-identity-pair FLOAT
+

Exclude pairs by overall percent identity e.g. 0.95 for 95%. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent-pair FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--proper-pairs-only
+

Require reads to be mapped as proper pairs. [default: not set]

+
+
+ +
+
--exclude-supplementary
+

Exclude supplementary alignments. [default: not set]

+
+
+
+
+

COVERAGE CALCULATION OPTIONS

+

-m, --methods METHOD

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Method(s) for calculating coverage [default: mean]. A more thorough description of the different methods is available at https://github.com/wwood/CoverM#calculation-methods but briefly:
methoddescription
mean(default) Average number of aligned reads overlapping each position on the contig
trimmed_meanAverage number of aligned reads overlapping each position after removing the most deeply and shallowly covered positions. See --trim-min/--trim-max to adjust.
coverage_histogramHistogram of coverage depths
covered_basesNumber of bases covered by 1 or more reads
varianceVariance of coverage depths
lengthLength of each contig in base pairs
countNumber of reads aligned to each contig. Note that a single read may be aligned to multiple contigs with supplementary alignments
metabat("MetaBAT adjusted coverage") Coverage as defined in Kang et al 2015 https://doi.org/10.7717/peerj.1165
reads_per_baseNumber of reads aligned divided by the length of the contig
rpkmReads mapped per kilobase of contig, per million mapped reads
+
+
--min-covered-fraction FRACTION
+

Genomes with less coverage than this are reported as having zero coverage. [default: 0.10]

+
+
+ +
+
--contig-end-exclusion INT
+

Exclude bases at the ends of reference sequences from calculation [default: 75]

+
+
+ +
+
--trim-min FRACTION
+

Remove this smallest fraction of positions when calculating trimmed_mean [default: 0.05]

+
+
+ +
+
--trim-max FRACTION
+

Maximum fraction for trimmed_mean calculations [default: 0.95]

+
+
+
+
+

OUTPUT

+
+
--output-format FORMAT
+

Shape of output: 'sparse' for long format, 'dense' for species-by-site. [default: dense]

+
+
+ +
+
--no-zeros
+

Omit printing of genomes that have zero coverage. [default: not set]

+
+
+ +
+
--bam-file-cache-directory DIRECTORY
+

Output BAM files generated during alignment to this directory. The directory may or may not exist. [default: not used]

+
+
+ +
+
--discard-unmapped
+

Exclude unmapped reads from cached BAM files. [default: not set]

+
+
+
+
+

GENERAL OPTIONS

+
+
-t, --threads INT
+

Number of threads for mapping, sorting and reading. [default: 1]

+
+
+ +
+
-h, --help
+

Output a short usage message. [default: not set]

+
+
+ +
+
--full-help
+

Output a full help message and display in 'man'. [default: not set]

+
+
+ +
+
--full-help-roff
+

Output a full help message in raw ROFF format for conversion to other formats. [default: not set]

+
+
+ +
+
-v, --verbose
+

Print extra debugging information. [default: not set]

+
+
+ +
+
-q, --quiet
+

Unless there is an error, do not print log messages. [default: not set]

+
+
+
+
+

EXIT STATUS

+
+
0
+

Successful program execution.

+
+
+ +
+
1
+

Unsuccessful program execution.

+
+
+ +
+
101
+

The program panicked.

+
+
+
+
+

AUTHOR

+
+
Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology <benjwoodcroft near gmail.com>
+
+
+
+
+
+
+
+ + + + + + + + diff --git a/docs/coverm-filter.html b/docs/coverm-filter.html new file mode 100644 index 0000000..014bcca --- /dev/null +++ b/docs/coverm-filter.html @@ -0,0 +1,254 @@ + + + + + + + + + + + + + + + + +coverm filter usage + + + + + + + + + + + + + + + + + + + + +
+
+
+
+coverm filter usage +Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology +2020-08-28 (coverm 0.5.0) +
+
+
+ + +
+
+ + +
+
+
+

NAME

+

coverm filter - Threshold alignments with insufficient identity (version 0.5.0)

+
+
+

SYNOPSIS

+

coverm filter FLAGS OPTIONS

+
+
+

DESCRIPTION

+

Only primary, non-supplementary alignments are considered, and output files are grouped by reference, but not sorted by position.

+
+
+

FLAGS

+
+
--inverse
+

Only keep reads which are unmapped or align below thresholds. Note that output records may still be marked as mapped if they do not meet the thresholds. [default: not set]

+
+
+ +
+
-v, --verbose
+

Print extra debugging information. [default: not set]

+
+
+ +
+
-q, --quiet
+

Unless there is an error, do not print log messages. [default: not set]

+
+
+ +
+
-h, --help
+

Output a short usage message. [default: not set]

+
+
+ +
+
--full-help
+

Output a full help message and display in 'man'. [default: not set]

+
+
+ +
+
--full-help-roff
+

Output a full help message in raw ROFF format for conversion to other formats. [default: not set]

+
+
+
+
+

OPTIONS

+
+
-b, --bam-files PATH ..
+

Path to reference-sorted BAM file(s). [required]

+
+
+ +
+
-o, --output-bam-files PATH ..
+

Path to corresponding output file(s). [required]

+
+
+ +
+
-t, --threads INT
+

Number of threads for output compression. [default: 1]

+
+
+
+
+

ALIGNMENT THRESHOLDING

+
+
--min-read-aligned-length INT
+

Exclude reads with smaller numbers of aligned bases. [default: 0]

+
+
+ +
+
--min-read-percent-identity FLOAT
+

Exclude reads by overall percent identity e.g. 0.95 for 95%. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. [default: 0.0]

+
+
+ +
+
--min-read-aligned-length-pair INT
+

Exclude pairs with smaller numbers of aligned bases. Implies --proper-pairs-only. [default: 0]

+
+
+ +
+
--min-read-percent-identity-pair FLOAT
+

Exclude pairs by overall percent identity e.g. 0.95 for 95%. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent-pair FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--proper-pairs-only
+

Require reads to be mapped as proper pairs. [default: not set]

+
+
+ +
+
--exclude-supplementary
+

Exclude supplementary alignments. [default: not set]

+
+
+
+
+

EXIT STATUS

+
+
0
+

Successful program execution.

+
+
+ +
+
1
+

Unsuccessful program execution.

+
+
+ +
+
101
+

The program panicked.

+
+
+
+
+

AUTHOR

+
+
Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology <benjwoodcroft near gmail.com>
+
+
+
+
+
+
+
+ + + + + + + + diff --git a/docs/coverm-genome.html b/docs/coverm-genome.html new file mode 100644 index 0000000..4243a49 --- /dev/null +++ b/docs/coverm-genome.html @@ -0,0 +1,614 @@ + + + + + + + + + + + + + + + + +coverm genome usage + + + + + + + + + + + + + + + + + + + + +
+
+
+
+coverm genome usage +Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology +2020-08-28 (coverm 0.5.0) +
+
+
+ + +
+
+ + +
+
+
+

NAME

+

coverm genome - Calculate read coverage per-genome (version 0.5.0)

+
+
+

SYNOPSIS

+

coverm genome <GENOME_DESCRIPTION> <MAPPING_INPUT> ..

+
+
+

DESCRIPTION

+

coverm genome calculates the coverage of a set of reads on a set of genomes.

+

This process can be undertaken in several ways, for instance by specifying BAM files or raw reads as input, defining genomes in different input formats, dereplicating genomes before mapping, using different mapping programs, thresholding read alignments, using different methods of calculating coverage and printing the calculated coverage in various formats.

+

The source code for CoverM is available at https://github.com/wwood/CoverM

+
+
+

READ MAPPING PARAMETERS

+
+
-1 PATH ..
+

Forward FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-2 PATH ..
+

Reverse FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-c, --coupled PATH ..
+

One or more pairs of forward and reverse possibly gzipped FASTA/Q files for mapping in order <sample1_R1.fq.gz> <sample1_R2.fq.gz> <sample2_R1.fq.gz> <sample2_R2.fq.gz> ..

+
+
+ +
+
--interleaved PATH ..
+

Interleaved FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
--single PATH ..
+

Unpaired FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-b, --bam-files PATH
+

Path to BAM file(s). These must be reference sorted (e.g. with samtools sort) unless --sharded is specified, in which case they must be read name sorted (e.g. with samtools sort -n). When specified, no read mapping algorithm is undertaken.

+
+
+
+
+

GENOME DEFINITION

+
+
-f, --genome-fasta-files PATH ..
+

Path(s) to FASTA files of each genome e.g. pathA/genome1.fna pathB/genome2.fa.

+
+
+ +
+
-d, --genome-fasta-directory PATH
+

Directory containing FASTA files of each genome.

+
+
+ +
+
-x, --genome-fasta-extension EXT
+

File extension of genomes in the directory specified with -d/--genome-fasta-directory. [default: fna]

+
+
+ +
+
--genome-fasta-list PATH
+

File containing FASTA file paths, one per line.

+
+
+ +
+
-r, --reference PATH
+

FASTA file of contigs e.g. concatenated genomes or metagenome assembly, or minimap2 index (with --minimap2-reference-is-index), or BWA index stem (with -p bwa-mem). If multiple reference FASTA files are provided and --sharded is specified, then reads will be mapped to references separately as sharded BAMs. NOTE: If genomic FASTA files are specified elsewhere (e.g. with --genome-fasta-files or --genome-fasta-directory), then --reference is not needed as a reference FASTA file can be derived by concatenating input genomes. However, while not necessary, --reference can be specified if an alternate reference sequence set is desired.

+
+
+ +
+
-s, --separator CHARACTER
+

This character separates genome names from contig names in the reference file. Requires --reference. [default: unspecified]

+
+
+ +
+
--single-genome
+

All contigs are from the same genome. Requires --reference. [default: not set]

+
+
+ +
+
--genome-definition FILE
+

File containing list of genome_name<tab>contig lines to define the genome of each contig. Requires --reference. [default: not set]

+
+
+
+
+

DEREPLICATION / GENOME CLUSTERING

+
+
--dereplicate
+

Do genome dereplication via average nucleotide identity (ANI) - choose a genome to represent all within a small distance, using Dashing for preclustering and FastANI for final ANI calculation. When this flag is used, dereplication occurs transparently through the Galah method (https://github.com/wwood/galah) [default: not set]

+
+
+ +
+
--checkm-tab-table PATH
+

CheckM tab table (i.e. the output of checkm .. --tab_table -f PATH ..) for defining genome quality, which is used both for filtering and to rank genomes during clustering.

+
+
+ +
+
--genome-info PATH
+

dRep style genome info table for defining quality. Used like --checkm-tab-table.

+
+
+ +
+
--min-completeness FLOAT
+

Ignore genomes with less completeness than this percentage. [default: not set]

+
+
+ +
+
--max-contamination FLOAT
+

Ignore genomes with more contamination than this percentage. [default: not set]

+
+
+ +
+
--dereplication-ani FLOAT
+

Overall ANI level to dereplicate at with FastANI. [default: 99]

+
+
+ +
+
--dereplication-aligned-fraction FLOAT
+

Min aligned fraction of two genomes for clustering. [default: 50]

+
+
+ +
+
--dereplication-fragment-length FLOAT
+

Length of fragment used in FastANI calculation (i.e. --fragLen). [default: 3000]

+
+
+ +

--dereplication-quality-formula FORMULA

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Scoring function for genome quality [default: Parks2020_reduced]. One of:
formuladescription
Parks2020_reduced(default) A quality formula described in Parks et al. 2020 https://doi.org/10.1038/s41587-020-0501-8 (Supplementary Table 19) but only including those scoring criteria that can be calculated from the sequence without homology searching: completeness-5*contamination-5*num_contigs/100-5*num_ambiguous_bases/100000
completeness-4contaminationcompleteness-4*contamination
completeness-5contaminationcompleteness-5*contamination
dRepcompleteness-5*contamination+contamination*(strain_heterogeneity/100)+0.5*log10(N50)
+
+
--dereplication-prethreshold-ani FLOAT
+

Require at least this dashing-derived ANI for preclustering and to avoid FastANI on distant lineages within preclusters. [default: 95]

+
+
+ +
+
--dereplication-precluster-method NAME
+

Method of calculating rough ANI for dereplication. 'dashing' for HyperLogLog, 'finch' for finch MinHash. [default: dashing]

+
+
+ +
+
--dereplication-output-cluster-definition PATH
+

Output a file of representative<TAB>member lines.

+
+
+ +
+
--dereplication-output-representative-fasta-directory PATH
+

Symlink representative genomes into this directory.

+
+
+ +
+
--dereplication-output-representative-fasta-directory-copy PATH
+

Copy representative genomes into this directory.

+
+
+ +
+
--dereplication-output-representative-list PATH
+

Print newline separated list of paths to representatives into this file.

+
+
+
+
+

SHARDING

+
+
--sharded
+

If -b/--bam-files was used: Input BAM files are read-sorted alignments of a set of reads mapped to multiple reference contig sets. Choose the best hit for each read pair. Otherwise if mapping was carried out: Map reads to each reference, choosing the best hit for each pair. [default: not set]

+
+
+ +
+
--exclude-genomes-from-deshard
+

Ignore genomes whose name appears in this newline-separated file when combining shards. [default: not set]

+
+
+
+
+

MAPPING ALGORITHM OPTIONS

+

-p, --mapper NAME

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Underlying mapping software used [default: minimap2-sr]. One of:
namedescription
minimap2-srminimap2 with '-x sr' option
bwa-membwa mem using default parameters
minimap2-ontminimap2 with '-x map-ont' option
minimap2-pbminimap2 with '-x map-pb' option
minimap2-no-presetminimap2 with no '-x' option
+
+
--minimap2-params PARAMS
+

Extra parameters to provide to minimap2, both indexing command (if used) and for mapping. Note that usage of this parameter has security implications if untrusted input is specified. '-a' is always specified to minimap2. [default: none]

+
+
+ +
+
--minimap2-reference-is-index
+

Treat reference as a minimap2 database, not as a FASTA file. [default: not set]

+
+
+ +
+
--bwa-params PARAMS
+

Extra parameters to provide to BWA. Note that usage of this parameter has security implications if untrusted input is specified. [default: none]

+
+
+
+
+

ALIGNMENT THRESHOLDING

+
+
--min-read-aligned-length INT
+

Exclude reads with smaller numbers of aligned bases. [default: 0]

+
+
+ +
+
--min-read-percent-identity FLOAT
+

Exclude reads by overall percent identity e.g. 0.95 for 95%. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. [default: 0.0]

+
+
+ +
+
--min-read-aligned-length-pair INT
+

Exclude pairs with smaller numbers of aligned bases. Implies --proper-pairs-only. [default: 0]

+
+
+ +
+
--min-read-percent-identity-pair FLOAT
+

Exclude pairs by overall percent identity e.g. 0.95 for 95%. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--min-read-aligned-percent-pair FLOAT
+

Exclude reads by percent aligned bases e.g. 0.95 means 95% of the read's bases must be aligned. Implies --proper-pairs-only. [default: 0.0]

+
+
+ +
+
--proper-pairs-only
+

Require reads to be mapped as proper pairs. [default: not set]

+
+
+ +
+
--exclude-supplementary
+

Exclude supplementary alignments. [default: not set]

+
+
+
+
+

COVERAGE CALCULATION OPTIONS

+

-m, --methods METHOD

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Method(s) for calculating coverage [default: relative_abundance]. A more thorough description of the different methods is available at https://github.com/wwood/CoverM#calculation-methods but briefly:
methoddescription
relative_abundance(default) Percentage relative abundance of each genome, and the unmapped read percentage
meanAverage number of aligned reads overlapping each position on the genome
trimmed_meanAverage number of aligned reads overlapping each position after removing the most deeply and shallowly covered positions. See --trim-min/--trim-max to adjust.
coverage_histogramHistogram of coverage depths
covered_basesNumber of bases covered by 1 or more reads
varianceVariance of coverage depths
lengthLength of each genome in base pairs
countNumber of reads aligned to each genome. Note that a single read may be aligned to multiple genomes with supplementary alignments
reads_per_baseNumber of reads aligned divided by the length of the genome
rpkmReads mapped per kilobase of genome, per million mapped reads
+
+
--min-covered-fraction FRACTION
+

Genomes with less coverage than this are reported as having zero coverage. [default: 0.10]

+
+
+ +
+
--contig-end-exclusion INT
+

Exclude bases at the ends of reference sequences from calculation [default: 75]

+
+
+ +
+
--trim-min FRACTION
+

Remove this smallest fraction of positions when calculating trimmed_mean [default: 0.05]

+
+
+ +
+
--trim-max FRACTION
+

Maximum fraction for trimmed_mean calculations [default: 0.95]

+
+
+
+
+

OUTPUT

+
+
--output-format FORMAT
+

Shape of output: 'sparse' for long format, 'dense' for species-by-site. [default: dense]

+
+
+ +
+
--no-zeros
+

Omit printing of genomes that have zero coverage. [default: not set]

+
+
+ +
+
--bam-file-cache-directory DIRECTORY
+

Output BAM files generated during alignment to this directory. The directory may or may not exist. [default: not set]

+
+
+ +
+
--discard-unmapped
+

Exclude unmapped reads from cached BAM files. [default: not set]

+
+
+
+
+

GENERAL OPTIONS

+
+
-t, --threads INT
+

Number of threads for mapping, sorting and reading. [default: 1]

+
+
+ +
+
-h, --help
+

Output a short usage message. [default: not set]

+
+
+ +
+
--full-help
+

Output a full help message and display in 'man'. [default: not set]

+
+
+ +
+
--full-help-roff
+

Output a full help message in raw ROFF format for conversion to other formats. [default: not set]

+
+
+ +
+
-v, --verbose
+

Print extra debugging information. [default: not set]

+
+
+ +
+
-q, --quiet
+

Unless there is an error, do not print log messages. [default: not set]

+
+
+
+
+

EXIT STATUS

+
+
0
+

Successful program execution.

+
+
+ +
+
1
+

Unsuccessful program execution.

+
+
+ +
+
101
+

The program panicked.

+
+
+
+
+

AUTHOR

+
+
Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology <benjwoodcroft near gmail.com>
+
+
+
+
+
+
+
+ + + + + + + + diff --git a/docs/coverm-make.html b/docs/coverm-make.html new file mode 100644 index 0000000..b1be786 --- /dev/null +++ b/docs/coverm-make.html @@ -0,0 +1,292 @@ + + + + + + + + + + + + + + + + +coverm make usage + + + + + + + + + + + + + + + + + + + + +
+
+
+
+coverm make usage +Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology +2020-08-28 (coverm 0.5.0) +
+
+
+ + +
+
+ + +
+
+
+

NAME

+

coverm make - Generate BAM files through mapping (version: 0.5.0)

+
+
+

SYNOPSIS

+

coverm make <REFERENCE> <READ_DEFINITION> <OUTPUT> ..

+
+
+

DESCRIPTION

+

coverm make generates BAM files by read mapping a set of reads against a reference FASTA database.

+
+
+

READ MAPPING PARAMETERS

+
+
-1 PATH ..
+

Forward FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-2 PATH ..
+

Reverse FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
-c, --coupled PATH ..
+

One or more pairs of forward and reverse possibly gzipped FASTA/Q files for mapping in order <sample1_R1.fq.gz> <sample1_R2.fq.gz> <sample2_R1.fq.gz> <sample2_R2.fq.gz> ..

+
+
+ +
+
--interleaved PATH ..
+

Interleaved FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+ +
+
--single PATH ..
+

Unpaired FASTA/Q file(s) for mapping. These may be gzipped or not.

+
+
+
+
+

REFERENCE

+
+
-r, --reference PATH
+

FASTA file of contigs e.g. concatenated genomes or metagenome assembly, or minimap2 index (with --minimap2-reference-is-index), or BWA index stem (with -p bwa-mem). [required]

+
+
+
+
+

MAPPING ALGORITHM OPTIONS

+

-p, --mapper NAME

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Underlying mapping software used [default: minimap2-sr]. One of:
namedescription
minimap2-srminimap2 with '-x sr' option
bwa-membwa mem using default parameters
minimap2-ontminimap2 with '-x map-ont' option
minimap2-pbminimap2 with '-x map-pb' option
minimap2-no-presetminimap2 with no '-x' option
+
+
--minimap2-params PARAMS
+

Extra parameters to provide to minimap2, for both the indexing command (if used) and for mapping. Note that usage of this parameter has security implications if untrusted input is specified. '-a' is always specified to minimap2. [default: none]

+
+
+ +
+
--minimap2-reference-is-index
+

Treat reference as a minimap2 database, not as a FASTA file. [default: not set]

+
+
+ +
+
--bwa-params PARAMS
+

Extra parameters to provide to BWA. Note that usage of this parameter has security implications if untrusted input is specified. [default: none]

+
+
+
+
+

OUTPUT

+
+
-o, --output-directory DIR
+

Where generated BAM files will go. The directory will be created if it does not exist. [required]

+
+
+ +
+
--discard-unmapped
+

Exclude unmapped reads from cached BAM files. [default: not set]

+
+
+
+
+

GENERAL OPTIONS

+
+
-t, --threads INT
+

Number of threads for mapping and sorting. [default: 1]

+
+
+ +
+
-h, --help
+

Output a short usage message. [default: not set]

+
+
+ +
+
--full-help
+

Output a full help message and display in 'man'. [default: not set]

+
+
+ +
+
--full-help-roff
+

Output a full help message in raw ROFF format for conversion to other formats. [default: not set]

+
+
+ +
+
-v, --verbose
+

Print extra debugging information. [default: not set]

+
+
+ +
+
-q, --quiet
+

Unless there is an error, do not print log messages. [default: not set]

+
+
+
+
+

EXIT STATUS

+
+
0
+

Successful program execution.

+
+
+ +
+
1
+

Unsuccessful program execution.

+
+
+ +
+
101
+

The program panicked.

+
+
+
+
+

AUTHOR

+
+
Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology <benjwoodcroft near gmail.com>
+
+
+
+
+
+
+
+ + + + + + + + diff --git a/prelude b/prelude new file mode 100644 index 0000000..61e9d0f --- /dev/null +++ b/prelude @@ -0,0 +1,10 @@ +--- +title: "coverm SUBCOMMAND usage" +author: "Ben Woodcroft, Centre for Microbiome Research, Queensland University of Technology" +date: "`r Sys.Date()` (`r system('cargo run -- --version',intern=T)`)" +output: + prettydoc::html_pretty: + theme: leonids + highlight: github + toc: true +--- diff --git a/release.sh b/release.sh index a6db14f..07c0922 100755 --- a/release.sh +++ b/release.sh @@ -2,26 +2,27 @@ set -o pipefail -export LIBCLANG_PATH=~/.guix-profile/lib +echo "Building normally .." +cargo build --release export VERSION=`cargo run -- --version |awk '{print $2}'` # For minimap header fix binary -export PATH=target/debug:$PATH +export PATH=target/release:$PATH echo "Found version $VERSION .." -echo "Building normally .." -cargo build --release + echo "Testing release version .." cargo test --release echo "Building musl static binary .." -cargo build --target x86_64-unknown-linux-musl --release +# Use cross not cargo here so htslib is OK - see https://github.com/rust-bio/rust-htslib +cross build --target x86_64-unknown-linux-musl --release echo "Making static dist .." -mkdir dist/coverm-x86_64-unknown-linux-musl-$VERSION +mkdir -p dist/coverm-x86_64-unknown-linux-musl-$VERSION cp \ target/x86_64-unknown-linux-musl/release/coverm \ target/x86_64-unknown-linux-musl/release/remove_minimap2_duplicated_headers \ @@ -31,4 +32,15 @@ cd dist tar czf coverm-x86_64-unknown-linux-musl-$VERSION.tar.gz coverm-x86_64-unknown-linux-musl-$VERSION cd .. -echo "Now make sure git is up to date, and run LIBCLANG_PATH=~/.guix-profile/lib cargo publish" +echo "Building HTML versions of man pages .." +for SUBCOMMAND in genome cluster contig filter make +do + echo "Documenting $SUBCOMMAND .." 
+ cargo run -- $SUBCOMMAND --full-help-roff |pandoc - -t markdown -f man |sed 's/\\\[/[/g; s/\\\]/]/g' |cat <(sed s/SUBCOMMAND/$SUBCOMMAND/ prelude) - >docs/coverm-$SUBCOMMAND.Rmd + echo "library(prettydoc); setwd('docs'); rmarkdown::render('coverm-$SUBCOMMAND.Rmd','prettydoc::html_pretty','coverm-$SUBCOMMAND.html')" |R --no-save + rm docs/coverm-$SUBCOMMAND.Rmd + echo "Finished documenting $SUBCOMMAND" +done + + +echo "Now make sure git is up to date, the documentation HTML has been properly generated and run cargo publish" diff --git a/src/bin/coverm.rs b/src/bin/coverm.rs index 07ebc8d..f38c604 100644 --- a/src/bin/coverm.rs +++ b/src/bin/coverm.rs @@ -630,7 +630,11 @@ fn main() { ); // Where write the completions to } Some("cluster") => { - galah::cluster_argument_parsing::run_cluster_subcommand(&matches); + galah::cluster_argument_parsing::run_cluster_subcommand( + &matches, + "coverm", + crate_version!(), + ); } _ => { app.print_help().unwrap(); diff --git a/src/cli.rs b/src/cli.rs index 14b61b4..12e51af 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,6 +1,8 @@ +use bird_tool_utils::clap_utils::{default_roff, monospace_roff}; +use bird_tool_utils_man::prelude::{Author, Flag, Manual, Opt, Section}; use clap::*; use galah::cluster_argument_parsing::GalahClustererCommandDefinition; -use man::prelude::{Author, Flag, Manual, Opt, Section}; +use roff::bold; const MAPPING_SOFTWARE_LIST: &[&str] = &[ "bwa-mem", @@ -35,23 +37,42 @@ lazy_static! { fn add_mapping_options(manual: Manual) -> Manual { manual.custom( Section::new("Mapping algorithm options") - .option(Opt::new("NAME").short("-p").long("--mapper").help( - "Underlying mapping software used \ - (\"minimap2-sr\", \"bwa-mem\", \"minimap2-ont\", \ - \"minimap2-pb\", or \"minimap2-no-preset\"). \ - minimap2 -sr, -ont, -pb, -no-preset specify \ - '-x' preset of minimap2 to be used \ - (with map-ont, map-pb for -ont, -pb). 
\ - [default: minimap2-sr]", - )) - .option(Opt::new("PARAMS").long("--minimap2-params").help( + .option(Opt::new("NAME").short("-p").long("--mapper").help(&format!( + "Underlying mapping software used {}. One of: {}", + default_roff("minimap2-sr"), + bird_tool_utils::clap_utils::table_roff(&[ + &["name", "description"], + &[ + &monospace_roff("minimap2-sr"), + &format!("minimap2 with '{}' option", &monospace_roff("-x sr")) + ], + &[ + &monospace_roff("bwa-mem"), + &format!("bwa mem using default parameters") + ], + &[ + &monospace_roff("minimap2-ont"), + &format!("minimap2 with '{}' option", &monospace_roff("-x map-ont")) + ], + &[ + &monospace_roff("minimap2-pb"), + &format!("minimap2 with '{}' option", &monospace_roff("-x map-pb")) + ], + &[ + &monospace_roff("minimap2-no-preset"), + &format!("minimap2 with no '{}' option", &monospace_roff("-x")) + ], + ]) + ))) + .option(Opt::new("PARAMS").long("--minimap2-params").help(&format!( "Extra parameters to provide to minimap2, \ both indexing command (if used) and for \ mapping. Note that usage of this parameter \ has security implications if untrusted input \ - is specified. '-a' is always specified. \ - [default: none i.e. \"\"]", - )) + is specified. '{}' is always specified to minimap2. \ + [default: none]", + &monospace_roff("-a") + ))) .flag(Flag::new().long("--minimap2-reference-is-index").help( "Treat reference as a minimap2 database, not as a FASTA file. [default: not set]", )) @@ -59,7 +80,7 @@ fn add_mapping_options(manual: Manual) -> Manual { "Extra parameters to provide to BWA. Note \ that usage of this parameter has security \ implications if untrusted input is specified. \ - [default: none i.e. 
\"\"]", + [default: none]", )), ) } @@ -67,42 +88,64 @@ fn add_mapping_options(manual: Manual) -> Manual { fn add_thresholding_options(manual: Manual) -> Manual { manual.custom( Section::new("Alignment thresholding") - .option(Opt::new("INT").long("--min-read-aligned-length").help( - "Exclude reads with smaller numbers of \ - aligned bases [default: 0]", - )) - .option(Opt::new("FLOAT").long("--min-read-percent-identity").help( - "Exclude reads by overall percent \ - identity e.g. 0.95 for 95%. [default: 0.0]", - )) - .option(Opt::new("FLOAT").long("--min-read-aligned-percent").help( - "Exclude reads by percent aligned \ + .option( + Opt::new("INT") + .long("--min-read-aligned-length") + .help(&format!( + "Exclude reads with smaller numbers of \ + aligned bases. {}", + default_roff("0") + )), + ) + .option( + Opt::new("FLOAT") + .long("--min-read-percent-identity") + .help(&format!( + "Exclude reads by overall percent \ + identity e.g. 0.95 for 95%. {}", + default_roff("0.0") + )), + ) + .option( + Opt::new("FLOAT") + .long("--min-read-aligned-percent") + .help(&format!( + "Exclude reads by percent aligned \ bases e.g. 0.95 means 95% of the read's \ - bases must be aligned. [default: 0.0]", - )) - .option(Opt::new("INT").long("--min-read-aligned-length-pair").help( - "Exclude pairs with smaller numbers of \ + bases must be aligned. {}", + default_roff("0.0") + )), + ) + .option( + Opt::new("INT") + .long("--min-read-aligned-length-pair") + .help(&format!( + "Exclude pairs with smaller numbers of \ aligned bases. \ - Implies --proper-pairs-only. [default: 0]", - )) + Implies --proper-pairs-only. {}", + default_roff("0") + )), + ) .option( Opt::new("FLOAT") .long("--min-read-percent-identity-pair") - .help( + .help(&format!( "Exclude pairs by overall percent \ identity e.g. 0.95 for 95%. \ - Implies --proper-pairs-only. [default: 0.0]", - ), + Implies --proper-pairs-only. 
{}", + default_roff("0.0") + )), ) .option( Opt::new("FLOAT") .long("--min-read-aligned-percent-pair") - .help( + .help(&format!( "Exclude reads by percent aligned \ bases e.g. 0.95 means 95% of the read's \ bases must be aligned. \ - Implies --proper-pairs-only. [default: 0.0]", - ), + Implies --proper-pairs-only. {}", + default_roff("0.0") + )), ) .flag( Flag::new() @@ -185,15 +228,16 @@ fn add_help_options_to_section(section: Section) -> Section { } fn sharding_section() -> Section { - Section::new("Sharding").flag(Flag::new().long("--sharded").help( - "If -b/--bam-files was used: \ + Section::new("Sharding").flag(Flag::new().long("--sharded").help(&format!( + "If {} was used: \ Input BAM files are read-sorted alignments \ of a set of reads mapped to multiple \ reference contig sets. Choose the best \ hit for each read pair. Otherwise if mapping was carried out: \ Map reads to each reference, choosing the \ best hit for each pair. [default: not set]", - )) + monospace_roff("-b/--bam-files") + ))) } fn add_verbosity_flags(manual: Manual) -> Manual { @@ -225,8 +269,11 @@ fn add_verbosity_flags_to_section(section: Section) -> Section { pub fn filter_full_help() -> Manual { let mut manual = Manual::new("coverm filter") - .about("Threshold alignments with insufficient identity") - .author(Author::new("Ben J Woodcroft").email("benjwoodcroft near gmail.com")) + .about(&format!( + "Threshold alignments with insufficient identity (version {})", + crate_version!() + )) + .author(Author::new(crate::AUTHOR).email("benjwoodcroft near gmail.com")) .description( "Only primary, non-supplementary alignments are considered, and output files \ are grouped by reference, but not sorted by position.", @@ -244,12 +291,10 @@ pub fn filter_full_help() -> Manual { .help(" Path to corresponding output file(s). 
[required]"), ); manual = add_thresholding_options(manual); - manual = manual.option( - Opt::new("INT") - .short("-t") - .long("--threads") - .help("Number of threads for output compression. [default: 1]"), - ); + manual = manual.option(Opt::new("INT").short("-t").long("--threads").help(&format!( + "Number of threads for output compression. {}", + default_roff("1") + ))); manual = manual.flag(Flag::new().long("--inverse").help( "Only keep reads which are unmapped or \ align below thresholds. Note that output \ @@ -264,8 +309,12 @@ pub fn filter_full_help() -> Manual { pub fn make_full_help() -> Manual { let mut manual = Manual::new("coverm make") - .about("Generate BAM files through mapping") - .author(Author::new("Ben J Woodcroft").email("benjwoodcroft near gmail.com")) + .about(&format!( + "Generate BAM files through mapping (version: {})", + crate_version!() + )) + .custom_synopsis_expansion(" ..") + .author(Author::new(crate::AUTHOR).email("benjwoodcroft near gmail.com")) .description( "coverm make generates BAM files by read mapping a set of reads against \ a reference FASTA database.\n\n", @@ -273,15 +322,19 @@ pub fn make_full_help() -> Manual { manual = manual.custom(read_mapping_params_section()); - manual = manual.custom(Section::new("Reference").option( - Opt::new("PATH").short("-r").long("--reference").help( - "FASTA file of contigs e.g. concatenated \ + manual = manual.custom( + Section::new("Reference").option(Opt::new("PATH").short("-r").long("--reference").help( + &format!( + "FASTA file of contigs e.g. concatenated \ genomes or metagenome assembly, or minimap2 \ index \ - (with --minimap2-reference-is-index), \ - or BWA index stem (with -p bwa-mem). [required]", - ), - )); + (with {}), \ + or BWA index stem (with {}). 
[required]", + monospace_roff("--minimap2-reference-is-index"), + monospace_roff("-p bwa-mem"), + ), + )), + ); manual = add_mapping_options(manual); @@ -297,10 +350,10 @@ pub fn make_full_help() -> Manual { )); let mut general_section = Section::new("General options").option( - Opt::new("INT") - .short("-t") - .long("--threads") - .help("Number of threads for mapping and sorting. [default: 1]"), + Opt::new("INT").short("-t").long("--threads").help(&format!( + "Number of threads for mapping and sorting. {}", + default_roff("1") + )), ); general_section = add_help_options_to_section(general_section); general_section = add_verbosity_flags_to_section(general_section); @@ -311,36 +364,52 @@ pub fn make_full_help() -> Manual { pub fn contig_full_help() -> Manual { let mut manual = Manual::new("coverm contig") - .about("Calculate read coverage per-contig") - .author(Author::new("Ben J Woodcroft").email("benjwoodcroft near gmail.com")) + .about(format!("Calculate read coverage per-contig (version {})",crate_version!())) + .custom_synopsis_expansion(" ..") + .author(Author::new(crate::AUTHOR).email("benjwoodcroft near gmail.com")) .description("coverm contig calculates the coverage of a set of reads on a set of contigs.\n\n\ This process can be undertaken in several ways, for instance by specifying BAM files or raw reads as input, \ using different mapping programs, thresholding read alignments, using different methods of calculating coverage \ - and printing the calculated coverage in various formats."); + and printing the calculated coverage in various formats.\n\ + \n\ + The source code for CoverM is available at https://github.com/wwood/CoverM"); - manual = manual.custom(read_mapping_params_section().option( - Opt::new("PATH").short("-b").long("--bam-files").help( - "Path to BAM file(s). These must be \ + manual = manual.custom( + read_mapping_params_section().option( + Opt::new("PATH") + .short("-b") + .long("--bam-files") + .help(&format!( + "Path to BAM file(s). 
These must be \ reference sorted (e.g. with samtools sort) \ - unless --sharded is specified, in which \ + unless {} is specified, in which \ case they must be read name sorted (e.g. \ - with samtools sort -n). When specified, no read mapping algorithm is undertaken.", + with {}). When specified, no read mapping algorithm is undertaken.", + monospace_roff("--sharded"), + monospace_roff("samtools sort -n"), + )), ), - )); + ); - manual = manual.custom(Section::new("Reference").option( - Opt::new("PATH").short("-r").long("--reference").help( - "FASTA file of contigs e.g. concatenated \ + manual = manual.custom( + Section::new("Reference").option(Opt::new("PATH").short("-r").long("--reference").help( + &format!( + "FASTA file of contigs e.g. concatenated \ genomes or metagenome assembly, or minimap2 \ index \ - (with --minimap2-reference-is-index), \ - or BWA index stem (with -p bwa-mem). \ + (with {}), \ + or BWA index stem (with {}). \ If multiple references FASTA files are \ - provided and --sharded is specified, \ + provided and {} is specified, \ then reads will be mapped to references \ - separately as sharded BAMs. [required]", - ), - )); + separately as sharded BAMs. [required unless {} is specified]", + monospace_roff("--minimap2-reference-is-index"), + monospace_roff("-p bwa-mem"), + monospace_roff("--sharded"), + monospace_roff("-b/--bam-files") + ), + )), + ); manual = manual.custom(sharding_section()); manual = add_mapping_options(manual); @@ -349,40 +418,45 @@ pub fn contig_full_help() -> Manual { manual = manual.custom( Section::new("Coverage calculation options") .option(Opt::new("METHOD").short("-m").long("--methods").help( - "Method(s) for calculating coverage. \ - One or more (space separated) of: \ - mean (default), \ - trimmed_mean, \ - coverage_histogram, \ - covered_fraction, \ - covered_bases, \ - variance, \ - length, \ - count, \ - metabat (\"MetaBAT adjusted coverage\"), \ - reads_per_base, \ - rpkm. 
\ - A more thorough description of the different \ - methods is available at \ - https://github.com/wwood/CoverM [default: mean]", - )) + &format!("Method(s) for calculating coverage {}. A more thorough description of the different methods is available at\n\ + https://github.com/wwood/CoverM#calculation-methods but briefly:\n\ + {}", + default_roff("mean"), + bird_tool_utils::clap_utils::table_roff(&[ + &["method","description"], + &[&monospace_roff("mean"), "(default) Average number of aligned reads overlapping each position on the contig"], + &[&monospace_roff("trimmed_mean"), &format!("Average number of aligned reads overlapping each position after removing the most deeply and shallow-ly covered positions. See {}/{} to adjust.", + &monospace_roff("--trim-min"), + &monospace_roff("--trim-max"))], + + &[&monospace_roff("coverage_histogram"), "Histogram of coverage depths"], + &[&monospace_roff("covered_bases"), "Number of bases covered by 1 or more reads"], + &[&monospace_roff("variance"), "Variance of coverage depths"], + &[&monospace_roff("length"), "Length of each contig in base pairs"], + &[&monospace_roff("count"), "Number of reads aligned toq each contig. Note that a single read may be aligned to multiple contigs with supplementary alignments"], + &[&monospace_roff("metabat"), "(\"MetaBAT adjusted coverage\") Coverage as defined in Kang et al 2015 https://doi.org/10.7717/peerj.1165"], + &[&monospace_roff("reads_per_base"), "Number of reads aligned divided by the length of the contig"], + &[&monospace_roff("rpkm"), "Reads mapped per kilobase of contig, per million mapped reads"], + ]), + ))) .option(Opt::new("FRACTION").long("--min-covered-fraction").help( - "Genomes with less coverage than this \ - reported as having zero coverage. \ - [default: 0.10]", + &format!("Genomes with less coverage than this \ + reported as having zero coverage. 
\ + {}", default_roff("0.10")) )) .option(Opt::new("INT").long("--contig-end-exclusion").help( - "Exclude bases at the ends of reference \ - sequences from calculation [default: 75]", + &format!("Exclude bases at the ends of reference \ + sequences from calculation {}", + default_roff("75")) )) .option(Opt::new("FRACTION").long("--trim-min").help( - "Remove this smallest fraction of positions \ - when calculating trimmed_mean \ - [default: 0.05]", + &format!("Remove this smallest fraction of positions \ + when calculating trimmed_mean {}", + default_roff("0.05")) )) .option(Opt::new("FRACTION").long("--trim-max").help( - "Maximum fraction for trimmed_mean \ - calculations [default: 0.95]", + &format!("Maximum fraction for trimmed_mean \ + calculations {}", default_roff("0.95")) )), ); @@ -428,58 +502,82 @@ pub fn contig_full_help() -> Manual { pub fn genome_full_help() -> Manual { let mut manual = Manual::new("coverm genome") - .about("Calculate read coverage per-genome") - .author(Author::new("Ben J Woodcroft").email("benjwoodcroft near gmail.com")) + .about(format!("Calculate read coverage per-genome (version {})",crate_version!())) + .custom_synopsis_expansion(" ..") + .author(Author::new(crate::AUTHOR).email("benjwoodcroft near gmail.com")) .description("coverm genome calculates the coverage of a set of reads on a set of genomes.\n\n\ This process can be undertaken in several ways, for instance by specifying BAM files or raw reads as input, \ defining genomes in different input formats, dereplicating genomes before mapping, \ using different mapping programs, thresholding read alignments, using different methods of calculating coverage \ - and printing the calculated coverage in various formats."); + and printing the calculated coverage in various formats.\n\ + \n\ + The source code for CoverM is available at https://github.com/wwood/CoverM"); - manual = manual.custom(read_mapping_params_section().option( - Opt::new("PATH").short("-b").long("--bam-files").help( 
- "Path to BAM file(s). These must be \ + manual = manual.custom( + read_mapping_params_section().option( + Opt::new("PATH") + .short("-b") + .long("--bam-files") + .help(&format!( + "Path to BAM file(s). These must be \ reference sorted (e.g. with samtools sort) \ - unless --sharded is specified, in which \ + unless {} is specified, in which \ case they must be read name sorted (e.g. \ - with samtools sort -n). When specified, no read mapping algorithm is undertaken.", + with {}). When specified, no read mapping algorithm is undertaken.", + monospace_roff("--sharded"), + monospace_roff("samtools sort -n") + )), ), - )); + ); manual = manual.custom( bird_tool_utils::clap_utils::add_genome_specification_to_section( Section::new("Genome definition")) .option( Opt::new("PATH").short("-r").long("--reference").help( - "FASTA file of contigs e.g. concatenated \ + &format!("FASTA file of contigs e.g. concatenated \ genomes or metagenome assembly, or minimap2 \ index \ - (with --minimap2-reference-is-index), \ - or BWA index stem (with -p bwa-mem). \ + (with {}), \ + or BWA index stem (with {}). \ If multiple references FASTA files are \ - provided and --sharded is specified, \ + provided and {} is specified, \ then reads will be mapped to references \ - separately as sharded BAMs.", + separately as sharded BAMs. {}: If genomic FASTA files are \ + specified elsewhere (e.g. with {} or {}), then {} is not needed as a reference FASTA file can be derived \ + by concatenating input genomes. 
However, while not necessary, {} can \ + be specified if an alternate reference sequence set is desired.", + monospace_roff("--minimap2-reference-is-index"), + monospace_roff("-p bwa-mem"), + monospace_roff("--sharded"), + bold("NOTE"), + monospace_roff("--genome-fasta-files"), + monospace_roff("--genome-fasta-directory"), + //monospace_roff("--bam-files"), + monospace_roff("--reference"), + monospace_roff("--reference"), ) ) + ) .option( Opt::new("CHARACTER") .short("-s") .long("--separator") - .help("This character separates genome names from contig names in the reference file. Requires --reference. \ - [default: unspecified]") + .help( + &format!("This character separates genome names from contig names in the reference file. Requires {}. \ + [default: unspecified]", monospace_roff("--reference"))) ) .flag( Flag::new() .long("--single-genome") - .help("All contigs are from the same genome. Requires --reference. [default: not set]") + .help(&format!("All contigs are from the same genome. Requires {}. [default: not set]", monospace_roff("--reference"))) ) .option( Opt::new("FILE") .long("--genome-definition") - .help("File containing list of \ - genome_namecontig lines to define the genome of each contig. Requires --reference. [default: not set]") + .help(&format!("File containing list of \ + genome_namecontig lines to define the genome of each contig. Requires {}. [default: not set]", monospace_roff("--reference"))) ) ); @@ -490,7 +588,7 @@ pub fn genome_full_help() -> Manual { all within a small distance, using Dashing for \ preclustering and FastANI for final ANI \ calculation. When this flag is used, dereplication occurs \ - transparently through the Galah method (https://github.com/wwood/galah) [default: not set i.e. 
no dereplication]", + transparently through the Galah method (https://github.com/wwood/galah) [default: not set]", ), ); derep_section = @@ -524,49 +622,52 @@ pub fn genome_full_help() -> Manual { manual = manual.custom( Section::new("Coverage calculation options") .option(Opt::new("METHOD").short("-m").long("--methods").help( - "Method(s) for calculating coverage. \ - One or more (space separated) of: \ - relative_abundance (default), \ - mean, \ - trimmed_mean, \ - coverage_histogram, \ - covered_fraction, \ - covered_bases, \ - variance, \ - length, \ - count, \ - reads_per_base, \ - rpkm. \ - A more thorough description of the different \ - methods is available at \ - https://github.com/wwood/CoverM [default: relative_abundance]", - )) + &format!("Method(s) for calculating coverage {}. A more thorough description of the different methods is available at\n\ + https://github.com/wwood/CoverM#calculation-methods but briefly:\n\ + {}", + default_roff("relative_abundance"), + bird_tool_utils::clap_utils::table_roff(&[ + &["method","description"], + &[&monospace_roff("relative_abundance"), "(default) Percentage relative abundance of each genome, and the unmapped read percentage"], + &[&monospace_roff("mean"), "Average number of aligned reads overlapping each position on the genome"], + &[&monospace_roff("trimmed_mean"), &format!("Average number of aligned reads overlapping each position after removing the most deeply and shallow-ly covered positions. See {}/{} to adjust.", + &monospace_roff("--trim-min"), + &monospace_roff("--trim-max"))], + &[&monospace_roff("coverage_histogram"), "Histogram of coverage depths"], + &[&monospace_roff("covered_bases"), "Number of bases covered by 1 or more reads"], + &[&monospace_roff("variance"), "Variance of coverage depths"], + &[&monospace_roff("length"), "Length of each genome in base pairs"], + &[&monospace_roff("count"), "Number of reads aligned toq each genome. 
Note that a single read may be aligned to multiple genomes with supplementary alignments"], + &[&monospace_roff("reads_per_base"), "Number of reads aligned divided by the length of the genome"], + &[&monospace_roff("rpkm"), "Reads mapped per kilobase of genome, per million mapped reads"], + ]) + ))) .option(Opt::new("FRACTION").long("--min-covered-fraction").help( - "Genomes with less coverage than this \ - reported as having zero coverage. \ - [default: 0.10]", + &format!("Genomes with less coverage than this \ + reported as having zero coverage. \ + {}", default_roff("0.10")) )) .option(Opt::new("INT").long("--contig-end-exclusion").help( - "Exclude bases at the ends of reference \ - sequences from calculation [default: 75]", + &format!("Exclude bases at the ends of reference \ + sequences from calculation {}", + default_roff("75")) )) .option(Opt::new("FRACTION").long("--trim-min").help( - "Remove this smallest fraction of positions \ - when calculating trimmed_mean \ - [default: 0.05]", + &format!("Remove this smallest fraction of positions \ + when calculating trimmed_mean {}", + default_roff("0.05")) )) .option(Opt::new("FRACTION").long("--trim-max").help( - "Maximum fraction for trimmed_mean \ - calculations [default: 0.95]", + &format!("Maximum fraction for trimmed_mean \ + calculations {}", default_roff("0.95")) )), ); manual = manual.custom( Section::new("Output") .option(Opt::new("FORMAT").long("--output-format").help( - "Shape of output: 'sparse' for long format, \ - 'dense' for species-by-site. \ - [default: dense]", + &format!("Shape of output: 'sparse' for long format, \ + 'dense' for species-by-site. 
{}", default_roff("dense")) )) .flag(Flag::new().long("--no-zeros").help( "Omit printing of genomes that have zero \ @@ -588,10 +689,10 @@ pub fn genome_full_help() -> Manual { ); let mut general_section = Section::new("General options").option( - Opt::new("INT") - .short("-t") - .long("--threads") - .help("Number of threads for mapping, sorting and reading. [default: 1]"), + Opt::new("INT").short("-t").long("--threads").help(&format!( + "Number of threads for mapping, sorting and reading. {}", + default_roff("1") + )), ); general_section = add_help_options_to_section(general_section); general_section = add_verbosity_flags_to_section(general_section); @@ -713,7 +814,7 @@ See coverm make --full-help for further options and further detail. let mut app = App::new("coverm") .version(crate_version!()) - .author("Ben J. Woodcroft ") + .author(crate::AUTHOR_AND_EMAIL) .about("Mapping coverage analysis for metagenomics") .args_from_usage( "-v, --verbose 'Print extra debug logging information' @@ -1432,13 +1533,14 @@ Ben J. Woodcroft .about("Remove alignments with insufficient identity") .help(FILTER_HELP.as_str()) .arg(Arg::with_name("full-help").long("full-help")) + .arg(Arg::with_name("full-help-roff").long("full-help-roff")) .arg( Arg::with_name("bam-files") .short("b") .long("bam-files") .multiple(true) .takes_value(true) - .required_unless_one(&["full-help"]), + .required_unless_one(&["full-help", "full-help-roff"]), ) .arg( Arg::with_name("output-bam-files") @@ -1446,7 +1548,7 @@ Ben J. 
Woodcroft .long("output-bam-files") .multiple(true) .takes_value(true) - .required_unless_one(&["full-help"]), + .required_unless_one(&["full-help", "full-help-roff"]), ) .arg(Arg::with_name("inverse").long("inverse")) .arg( diff --git a/src/lib.rs b/src/lib.rs index 9c8d972..ee7bbec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,12 +30,18 @@ extern crate clap; #[macro_use] extern crate lazy_static; extern crate bird_tool_utils; +extern crate bird_tool_utils_man; extern crate galah; -extern crate man; +extern crate roff; extern crate version_compare; pub const CONCATENATED_FASTA_FILE_SEPARATOR: &str = "~"; +pub const AUTHOR: &str = + "Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology"; +pub const AUTHOR_AND_EMAIL: &str = + "Ben J. Woodcroft, Centre for Microbiome Research, Queensland University of Technology "; + #[derive(PartialEq, Debug, Clone)] pub struct ReadsMapped { num_mapped_reads: u64,