Skip to content

Commit

Permalink
docu
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Feb 22, 2022
1 parent e6f581e commit 43f205e
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 77 deletions.
28 changes: 28 additions & 0 deletions include/bio/map_io/all.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
// Copyright (c) 2020-2021, deCODE Genetics
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE
// -----------------------------------------------------------------------------------------------------

/*!\file
* \brief Meta-include that includes the whole Map I/O module.
* \author Svenja Mehringer <svenja.mehringer AT fu-berlin.de>
*/

#pragma once

/*!\defgroup map_io Map I/O
* \ingroup bio
* \brief Reader and writer for SAM and BAM files.
*
* This module provides high-level APIs to read and write SAM and BAM files.
*
* To read files, have a look at bio::map_io::reader and to write files have a look at bio::map_io::writer.
*
*/

/*!\namespace bio::map_io
* \brief Namespace for the Map I/O module.
*/
62 changes: 35 additions & 27 deletions include/bio/map_io/header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,15 @@ namespace bio::map_io
* \ingroup map_io
* \details
*
* TODO
* Each header line begins with the character `@` followed by one of the two-letter header record type codes
* defined in this section. In the header, each line is tab-delimited and, apart from `@CO` lines, each data field
* follows a format `TAG:VALUE` where TAG is a two-character string that defines the format and content of
* VALUE. Thus header lines match `/^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/` or are comment lines starting
* with `@CO` followed by a tab and any character sequence.
* Within each (non-`@CO`) header line, no field tag may appear more than once and the order in which the fields
* appear is not significant.
*
* \sa https://samtools.github.io/hts-specs/SAMv1.pdf
*/
class header
{
Expand All @@ -57,7 +65,6 @@ class header
header & operator=(header &&) = default; //!< Defaulted.

/*!\brief Construct from a range of reference ids.
* \param[in] The plain text header.
* \param[in] ref_ids The range over reference ids to redirect the pointer at.
*/
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
Expand Down Expand Up @@ -95,6 +102,7 @@ class header
//!\brief The reference sequence names.
std::vector<std::string_view> reference_names;

//!\brief Additional information to the reference sequence (same ordering as `reference_names`).
std::vector<std::tuple<int32_t, std::string>> reference_names_info{};

//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
Expand All @@ -103,6 +111,7 @@ class header
//!\brief Whether reference sequence names were given to the header on construction.
bool reference_names_given_on_construction{false};

//!\brief Print a B.I.O warning message with current line number in diagnostic.
/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
{
// if (print_warnings)
Expand All @@ -114,22 +123,22 @@ class header
// }
}
public:
/*!\name [@HD] File-level meta data
/*!\name [HD] File-level meta data
* \brief You can directly edit these member variables.
* \{
*/
std::string format_version{}; //!< [@HD VN] The file format version. Note: this is overwritten by our formats on output.
std::string sorting{}; //!< [@HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
std::string grouping{}; //!< [@HD GO] The grouping of the file. SAM: [none, query, reference].
std::string subsorting{}; //!< [@HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate](:[A-Za-z0-9_-]+)+.
std::string format_version{}; //!< [HD VN] The file format version. Note: this is overwritten by our formats on output.
std::string sorting{}; //!< [HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate].
std::string grouping{}; //!< [HD GO] The grouping of the file. SAM: [none, query, reference].
std::string subsorting{}; //!< [HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, coordinate]`(:[A-Za-z0-9_-]+)+`.
//!\}

/*!\name [@SQ] Reference sequence dictionary
/*!\name [SQ] Reference sequence dictionary
* \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
* \{
*/

/*!\brief [@SQ SN] Reference sequence names
/*!\brief [SQ SN] Reference sequence names
*
* \details
*
Expand All @@ -139,11 +148,11 @@ class header
* 1) Reference id information is provided on construction. In this case, no copy is made but this function
* gives you a reference to the provided range. When reading the header or the records, their reference
* information will be checked against the given input.
* 2) No reference information is provided on construction but the \@SQ tags are present in the header.
* 2) No reference information is provided on construction but the `@SQ` tags are present in the header.
* In this case, the reference id information is extracted from the header and this member function provides
* access to them. When reading the records, their reference id information will be checked against the header
* information.
* 3) No reference information is provided on construction an no \@SQ tags are present in the header.
* 3) No reference information is provided on construction and no `@SQ` tags are present in the header.
* In this case, the reference information is parsed from the records field::ref_id and stored in the header.
* This member function then provides access to the unique list of reference names encountered in the records.
*/
Expand All @@ -152,11 +161,11 @@ class header
return reference_names;
}

/*!\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
/*!\brief [SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
*
* \details
*
* The reference information store the length (\@LN tag) and
* The reference information stores the length (`LN` tag of the `@SQ` line) and
* additional information of each reference sequence in the file. The record
* must then store only the index of the reference.
* The name and length information are required if the header is provided
Expand All @@ -166,17 +175,17 @@ class header
*
* The additional information (2nd tuple entry) must model
* the following formatting rules: The information is given in a tab separated
* TAG:VALUE format, where TAG must be one of [AH, AN, AS, m5, SP, UR].
* `TAG:VALUE` format, where TAG must be one of [AH, AN, AS, M5, SP, UR].
* The following information and rules apply for each tag (taken from the SAM specs):
*
* * **AH:** Indicates that this sequence is an alternate locus. The value is the locus in the primary assembly for
* which this sequence is an alternative, in the format 'chr:start-end', 'chr' (if known), or '*' (if
* unknown), where 'chr' is a sequence in the primary assembly. Must not be present on sequences in the
* which this sequence is an alternative, in the format `chr:start-end`, `chr` (if known), or `*` (if
* unknown), where `chr` is a sequence in the primary assembly. Must not be present on sequences in the
* primary assembly.
* * **AN:** Alternative reference sequence names. A comma-separated list of alternative names that tools may use
* when referring to this reference sequence. These alternative names are not used elsewhere within the
* SAM file; in particular, they must not appear in alignment records’ RNAME or RNEXT fields. regular
* expression : name (, name )* where name is [0-9A-Za-z][0-9A-Za-z*+.@ \|-]*
* expression : `name (, name )*` where name is `[0-9A-Za-z][0-9A-Za-z*+.@ \|-]*`.
* * **AS:** Genome assembly identifier.
* * **M5:** MD5 checksum of the sequence. See Section 1.3.1
* * **SP:** Species.
Expand Down Expand Up @@ -204,7 +213,7 @@ class header
}
//!\}

/*!\name [@RG] Read groups
/*!\name [RG] Read groups
* \brief You can directly edit these member variables.
* \{
*/
Expand All @@ -215,7 +224,7 @@ class header
* The read group list stores the group id and
* additional information of each read group in the file. The record
* may store a RG tag information referencing one of the stored id's.
* The id information is required if the @RG header line is provided.
* The id information is required if the \@RG header line is provided.
*
* The additional information (2nd tuple entry) for the SAM format must follow
* the following formatting rules: The information is given in a tab separated
Expand All @@ -225,13 +234,13 @@ class header
* * **BC:** Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by
* the sequencing machine in the absence of errors. If there are several barcodes for the sample/library
* (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes
* separating them with hyphens ('-').
* separating them with hyphens (`-`).
* * **CN:** Name of sequencing center producing the read.
* * **DS:** Description. UTF-8 encoding may be used.
* * **DT:** Date the run was produced (ISO8601 date or date/time).
* * **FO:** Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each
* read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other
* characters. Format : /\*\|[ACMGRSVTWYHKDBN]+/
* characters. Format : `/\*\|[ACMGRSVTWYHKDBN]+/`
* * **KS:** The array of nucleotide bases that correspond to the key sequence of each read.
* * **LB:** Library.
* * **PG:** Programs used for processing the read group.
Expand All @@ -245,7 +254,7 @@ class header
std::vector<std::pair<std::string, std::string>> read_groups{};
//!\}

/*!\name [@PG] Programm information
/*!\name [PG] Program information
* \brief You can directly edit these member variables.
* \{
*/
Expand All @@ -263,7 +272,7 @@ class header
std::vector<program_info_t> program_infos{}; //!< The list of program information.
//!\}

/*!\name [@CO] Comments
/*!\name [CO] Comments
* \brief You can directly edit these member variables.
* \{
*/
Expand All @@ -272,17 +281,16 @@ class header
};

/*!\brief Reads the SAM header.
* \tparam stream_view_type The type of the stream as a view.
* \param[in, out] stream_view The stream view to iterate over.
* \param[in] header_string The full header as a std::string_view.
*
* \throws seqan3::format_error if any unexpected character or format is encountered.
* \throws bio::map_io::format_error if any unexpected character or format is encountered.
*
* \details
*
* Reading the header format is done according to the official
* [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf).
*
* The function throws a seqan3::format_error if any unknown tag was encountered. It will also fail if the format is
* The function throws a bio::map_io::format_error if any unknown tag was encountered. It will also fail if the format is
* not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
* error.
*/
Expand Down
52 changes: 25 additions & 27 deletions include/bio/map_io/sam_flag.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,37 +35,35 @@ namespace bio::map_io
* Adapted from the [SAM specifications](https://samtools.github.io/hts-specs/SAMv1.pdf) are the following additional
* information to some flag values:
* * For each read/contig in a SAM file, it is required that one and only one line associated with the read
* has neither the seqan3::sam_flag::secondary_alignment nor the seqan3::sam_flag::supplementary_alignment flag value
* set (satisfies `FLAG & 0x900 == 0 `). This line is called the **primary alignment** of the read.
* * seqan3::sam_flag::secondary_alignment (bit `0x100`) marks the alignment not to be used in certain analyses when
* the tools in use are aware of this bit. It is typically used to flag alternative mappings when multiple mappings
* has neither the bio::map_io::sam_flag::secondary_alignment nor the bio::map_io::sam_flag::supplementary_alignment
* flag value set (satisfies `(FLAG & 0x900) == 0`). This line is called the **primary alignment** of the read.
* * bio::map_io::sam_flag::secondary_alignment (bit `0x100`) marks the alignment not to be used in certain analyses
* when the tools in use are aware of this bit. It is typically used to flag alternative mappings when multiple mappings
* are presented in a SAM.
* * seqan3::sam_flag::supplementary_alignment (bit `0x800`) indicates that the corresponding alignment line is part
* * bio::map_io::sam_flag::supplementary_alignment (bit `0x800`) indicates that the corresponding alignment line is part
* of a chimeric alignment. If the SAM/BAM file corresponds to long reads (nanopore/pacbio) this happens when
* reads are split before being aligned and the best matching part is marked as primary, while all other aligned
* parts are marked supplementary.
* * seqan3::sam_flag::unmapped (bit `0x4`) is the only reliable place to tell whether the read is unmapped.
* If seqan3::sam_flag::unmapped is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, and
* seqan3::sam_flag::proper_pair, seqan3::sam_flag::secondary_alignment, and seqan3::sam_flag::supplementary_alignment
* * bio::map_io::sam_flag::unmapped (bit `0x4`) is the only reliable place to tell whether the read is unmapped.
* If bio::map_io::sam_flag::unmapped is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, and
* bio::map_io::sam_flag::proper_pair, bio::map_io::sam_flag::secondary_alignment, and bio::map_io::sam_flag::supplementary_alignment
* (bits `0x2`, `0x100`, and `0x800`).
* * seqan3::sam_flag::on_reverse_strand (bit `0x10`) indicates whether the read sequence has been reverse complemented
* and the quality string is reversed. When bit seqan3::sam_flag::unmapped (`0x4`) is unset, this
* corresponds to the strand to which the segment has been mapped: seqan3::sam_flag::on_reverse_strand (bit `0x10`)
* unset indicates the forward strand, while set indicates the reverse strand. When seqan3::sam_flag::unmapped (`0x4`)
* * bio::map_io::sam_flag::on_reverse_strand (bit `0x10`) indicates whether the read sequence has been reverse complemented
* and the quality string is reversed. When bit bio::map_io::sam_flag::unmapped (`0x4`) is unset, this
* corresponds to the strand to which the segment has been mapped: bio::map_io::sam_flag::on_reverse_strand (bit `0x10`)
* unset indicates the forward strand, while set indicates the reverse strand. When bio::map_io::sam_flag::unmapped (`0x4`)
* is set, this indicates whether the unmapped read is stored in its original orientation as it came off the
* sequencing machine.
* * seqan3::sam_flag::first_in_pair and seqan3::sam_flag::second_in_pair (bits `0x40` and `0x80`) reflect the read
* ordering within each template inherent in the sequencing technology used. If seqan3::sam_flag::first_in_pair and
* seqan3::sam_flag::second_in_pair (`0x40` and `0x80`) are both set, the read is part of a linear template, but it
* * bio::map_io::sam_flag::first_in_pair and bio::map_io::sam_flag::second_in_pair (bits `0x40` and `0x80`) reflect the read
* ordering within each template inherent in the sequencing technology used. If bio::map_io::sam_flag::first_in_pair and
* bio::map_io::sam_flag::second_in_pair (`0x40` and `0x80`) are both set, the read is part of a linear template, but it
* is neither the first nor the last read. If both are unset, the index of the read in the template is unknown.
* This may happen for a non-linear template or when this information is lost during data processing.
* * If seqan3::sam_flag::paired (bit `0x1`) is unset, no assumptions can be made about seqan3::sam_flag::proper_pair,
* seqan3::sam_flag::mate_unmapped, seqan3::sam_flag::mate_on_reverse_strand, seqan3::sam_flag::first_in_pair and
* seqan3::sam_flag::second_in_pair (bits `0x2`, `0x8`, `0x20`, `0x40` and `0x80`).
* * If bio::map_io::sam_flag::paired (bit `0x1`) is unset, no assumptions can be made about bio::map_io::sam_flag::proper_pair,
* bio::map_io::sam_flag::mate_unmapped, bio::map_io::sam_flag::mate_on_reverse_strand, bio::map_io::sam_flag::first_in_pair and
* bio::map_io::sam_flag::second_in_pair (bits `0x2`, `0x8`, `0x20`, `0x40` and `0x80`).
*
* \sa https://broadinstitute.github.io/picard/explain-flags.html
*
* \remark For a complete overview, take a look at \ref io_sam_file
*/
enum class sam_flag : uint16_t
{
Expand All @@ -76,27 +74,27 @@ enum class sam_flag : uint16_t
mate_unmapped = 0x8, //!< The mate of this read is not mapped to a reference (unaligned).
on_reverse_strand = 0x10, //!< The read sequence has been reverse complemented before being mapped (aligned).
mate_on_reverse_strand = 0x20, //!< The mate sequence has been reverse complemented before being mapped (aligned).
first_in_pair = 0x40, //!< Indicates the ordering (see details in the seqan3::sam_flag description).
second_in_pair = 0x80, //!< Indicates the ordering (see details in the seqan3::sam_flag description).
first_in_pair = 0x40, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description).
second_in_pair = 0x80, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description).
secondary_alignment = 0x100, //!< This read alignment is an alternative (possibly suboptimal) to the primary.
failed_filter = 0x200, //!< The read alignment failed a filter, e.g. quality controls.
duplicate = 0x400, //!< The read is marked as a PCR duplicate or optical duplicate.
supplementary_alignment = 0x800 //!< This sequence is part of a split alignment and is not the primary alignment.
};

//!\cond DEV
//!\brief Enables bitwise operations for seqan3::sam_flags.
//!\ingroup io_sam_file
//!\sa seqan3::enum_bitwise_operators enables combining enum values.
/*!\brief Enables bitwise operations for bio::map_io::sam_flag.
* \ingroup map_io
* \sa seqan3::enum_bitwise_operators enables combining enum values.
*/
template <>
constexpr bool add_enum_bitwise_operators<sam_flag> = true;
//!\endcond

/*!\brief Overload for the seqan3::sam_flags.
/*!\brief seqan3::debug_stream overload for bio::map_io::sam_flag.
* \tparam char_t The character type of the debug_stream.
* \param stream The seqan3::debug_stream.
* \param flag The flag to print.
* \relates seqan3::debug_stream_type
*/
template <typename char_t>
inline seqan3::debug_stream_type<char_t> & operator<<(seqan3::debug_stream_type<char_t> & stream, sam_flag const flag)
Expand Down
Loading

0 comments on commit 43f205e

Please sign in to comment.