From 6b35140d9f1ba2252d217a895f6aec6fb5c4049c Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Fri, 11 Mar 2022 10:27:26 +0100 Subject: [PATCH] [FEATURE] map_io reader and SAM input. --- .../bio/detail/add_enum_bitwise_operators.hpp | 151 ++++ include/bio/format/sam.hpp | 52 ++ include/bio/format/sam_input_handler.hpp | 355 +++++++++ include/bio/map_io/all.hpp | 28 + include/bio/map_io/header.hpp | 583 ++++++++++++++ include/bio/map_io/misc.hpp | 39 + include/bio/map_io/reader.hpp | 109 +++ include/bio/map_io/reader_options.hpp | 119 +++ include/bio/map_io/sam_flag.hpp | 107 +++ include/bio/map_io/sam_tag_dictionary.hpp | 746 ++++++++++++++++++ include/bio/record.hpp | 26 +- test/unit/format/CMakeLists.txt | 1 + test/unit/format/sam_input_test.cpp | 277 +++++++ test/unit/format/sam_input_test_template.hpp | 286 +++++++ test/unit/map_io/CMakeLists.txt | 2 + test/unit/map_io/data.hpp | 34 + test/unit/map_io/header_test.cpp | 337 ++++++++ test/unit/map_io/map_io_reader_test.cpp | 286 +++++++ 18 files changed, 3525 insertions(+), 13 deletions(-) create mode 100644 include/bio/detail/add_enum_bitwise_operators.hpp create mode 100644 include/bio/format/sam.hpp create mode 100644 include/bio/format/sam_input_handler.hpp create mode 100644 include/bio/map_io/all.hpp create mode 100644 include/bio/map_io/header.hpp create mode 100644 include/bio/map_io/misc.hpp create mode 100644 include/bio/map_io/reader.hpp create mode 100644 include/bio/map_io/reader_options.hpp create mode 100644 include/bio/map_io/sam_flag.hpp create mode 100644 include/bio/map_io/sam_tag_dictionary.hpp create mode 100644 test/unit/format/sam_input_test.cpp create mode 100644 test/unit/format/sam_input_test_template.hpp create mode 100644 test/unit/map_io/CMakeLists.txt create mode 100644 test/unit/map_io/data.hpp create mode 100644 test/unit/map_io/header_test.cpp create mode 100644 test/unit/map_io/map_io_reader_test.cpp diff --git a/include/bio/detail/add_enum_bitwise_operators.hpp b/include/bio/detail/add_enum_bitwise_operators.hpp new file mode 100644 index 0000000..ea80884 --- /dev/null +++ b/include/bio/detail/add_enum_bitwise_operators.hpp @@ -0,0 +1,151 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides seqan3::add_enum_bitwise_operators. + * \author Hannes Hauswedell + */ + +#pragma once + +#include + +#include + +namespace bio::map_io +{ +/*!\interface seqan3::enum_bitwise_operators + * \brief You can expect these functions on all types that overload seqan3::add_enum_bitwise_operators. + */ +/*!\name Requirements for seqan3::enum_bitwise_operators + * \relates seqan3::enum_bitwise_operators + * \brief You can expect these member functions. + * \{ + * \fn operator&(t lhs, t rhs) + * \brief Returns the binary `&` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary conjunction of `lhs` and `rhs`. + * + * \fn operator|(t lhs, t rhs) + * \brief Returns the binary `|` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary disjunction of `lhs` and `rhs`. + * + * \fn operator^(t lhs, t rhs) + * \brief Returns the binary `^` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary XOR operation on `lhs` and `rhs`. + * + * \fn operator~(t lhs) + * \brief Returns the binary `~` operator of lhs. + * \param lhs First enum. + * + * \returns the binary NOT operation on `lhs`. + * + * \fn operator&=(t & lhs, t rhs) + * \brief Returns the binary `&=` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary AND assigment on `lhs`. + * + * \fn operator|=(t & lhs, t rhs) + * \brief Returns the binary `|=` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary OR assignment on `lhs`. + * + * \fn operator^=(t & lhs, t rhs) + * \brief Returns the binary `^=` operator of lhs and rhs. + * \param lhs First enum. + * \param rhs Second enum. + * + * \returns the binary XOR assignment on `lhs`. + * \} + */ + +//!\cond DEV +/*!\brief Set to true for a scoped enum to have binary operators overloaded. + * \ingroup core + * + * \details + * + * If this type trait is specialised for an enum, the binary operators `&`, `|`, `^`, `~`, `&=`, `|=`, `^=` will be + * added and behave just like for ints or unscoped enums. + * + * ### Example + * + * \include test/snippet/core/add_enum_bitwise_operators.cpp + */ +template +constexpr bool add_enum_bitwise_operators = false; + +/*!\name Binary operators for scoped enums + * \brief Perform binary operations like on ints or weak enums. These overloads are available if + * seqan3::add_enum_bitwise_operators is defined for your type. + * \ingroup core + * + * \details + * + * \see seqan3::add_enum_bitwise_operators + * \{ + */ +template +constexpr t operator&(t lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + return static_cast(static_cast>(lhs) & static_cast>(rhs)); +} + +template +constexpr t operator|(t lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + return static_cast(static_cast>(lhs) | static_cast>(rhs)); +} + +template +constexpr t operator^(t lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + return static_cast(static_cast>(lhs) ^ static_cast>(rhs)); +} + +template +constexpr t operator~(t lhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + return static_cast(~static_cast>(lhs)); +} + +template +constexpr t & operator&=(t & lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + lhs = lhs & rhs; + return lhs; +} + +template +constexpr t & operator|=(t & lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + lhs = lhs | rhs; + return lhs; +} + +template +constexpr t & operator^=(t & lhs, t rhs) noexcept requires std::is_enum_v && add_enum_bitwise_operators +{ + lhs = lhs ^ rhs; + return lhs; +} +//!\} +//!\endcond + +} // namespace bio::map_io diff --git a/include/bio/format/sam.hpp b/include/bio/format/sam.hpp new file mode 100644 index 0000000..1c84a3f --- /dev/null +++ b/include/bio/format/sam.hpp @@ -0,0 +1,52 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * brief Provides the seqan3::format_sam. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include + +#include + +namespace bio +{ + +/*!\brief The sam format. + * \ingroup format + * + * \details + * + * This is the sam format tag. If you want to read sam files, use bio::map_io::reader, and if you want + * to write sam files, use bio::map_io::writer. + * + * ### Introduction + * + * sam is the de-facto-standard for mapping storage in bionformatics. See the + * [article on wikipedia](todo) for a an in-depth description of the format. + * + * ### Fields + * + * todo + * + * ### Implementation notes + * + * todo + * + */ +struct sam +{ + //!\brief The valid file extensions for this format; note that you can modify this value. + static inline std::vector file_extensions{{"sam"}}; +}; + +} // namespace bio diff --git a/include/bio/format/sam_input_handler.hpp b/include/bio/format/sam_input_handler.hpp new file mode 100644 index 0000000..4473c7a --- /dev/null +++ b/include/bio/format/sam_input_handler.hpp @@ -0,0 +1,355 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the seqan3::format_input_handler. + * \author Svenja Mehringer + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace bio +{ + +/*!\brief Format input handler for the SAM format (bio::sam). + * \ingroup format + * \details + * + * ### Attention + * + * Most users should not perform I/O through input/output handlers but should instead use the respective + * readers/writers. See the overview (TODO link) for more information. + * + * ### Options + * + * The following options are considered if the respective member variable is availabele in the object passed to + * the constructor: + * + * | Member | Type | Default | Description | + * |-----------------|---------|---------|------------------------------------------------------------------| + * |`print_warnings` |`bool` | `false` | Whether to print non-critical warnings to seqan3::debug_stream | + * + */ +template <> +class format_input_handler : public format_input_handler_base> +{ +private: + /*!\name CRTP related entities + * \{ + */ + //!\brief The type of the CRTP base class. + using base_t = format_input_handler_base>; + using base_t::parse_field; + using base_t::parse_field_aux; + using base_t::stream; + + //!\brief Befriend the base class to enable CRTP. + friend base_t; + //!\} + + /*!\name Options + * \{ + */ + //!\brief Whether to print warnings or not. + bool print_warnings = true; + //!\} + + //!\brief Throw a bio::format_error with an error message with current line number in diagnostic. + [[noreturn]] void error(auto const &... messages) const + { + std::string message = "[B.I.O sam format error in line " + detail::to_string(line) + "] "; + ((message += detail::to_string(messages)), ...); + + throw format_error{message}; + } + + //!\brief Print a B.I.O warning message with current line number in diagnostic. + /* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const + { + if (print_warnings) + { + seqan3::debug_stream << "[B.I.O sam format warning in line " << line << "] "; + (seqan3::debug_stream << ... << messages); + seqan3::debug_stream << std::endl; + } + } + + /*!\name Raw record handling + * \{ + */ + //!\brief The fields that this format supports [the base class accesses this type]. + using format_fields = decltype(map_io::default_field_ids); + //!\brief Type of the raw record. + using raw_record_type = + bio::record>; + + //!\brief Type of the low-level iterator. + using lowlevel_iterator = detail::plaintext_input_iterator; + + //!\brief The raw record. + raw_record_type raw_record; + //!\brief The header. + map_io::header header; + //!\brief Lowlevel stream iterator. + lowlevel_iterator file_it; + //!\brief Cache of the chromosome string. + std::string last_rname; + //!\brief Cache of the chromosome index. + int32_t last_rname_idx = -1; + //!\brief Current line number in file. + size_t line = 0; + + //!\brief Read the raw record [the base class invokes this function]. + void read_raw_record() + { + ++line; + ++file_it; + + // TODO assert number of fields + + get(raw_record) = (*file_it).fields[0]; + get(raw_record) = (*file_it).fields[1]; + get(raw_record) = (*file_it).fields[2]; + get(raw_record) = (*file_it).fields[3]; + get(raw_record) = (*file_it).fields[4]; + get(raw_record) = (*file_it).fields[5]; + get(raw_record) = (*file_it).fields[6]; + get(raw_record) = (*file_it).fields[7]; + get(raw_record) = (*file_it).fields[8]; + get(raw_record) = (*file_it).fields[9]; + get(raw_record) = (*file_it).fields[10]; + + // fields[10].end() that is guaranteed to be char* + char const * end_qual = (*file_it).fields[10].data() + (*file_it).fields[10].size() + 1 /*\t or \n*/; + // line.end() that is guaranteed to be char* + char const * end_line = (*file_it).line.data() + (*file_it).line.size(); + // SAM tags go from end of qual til end of line (possibly empty) + get(raw_record) = std::string_view{end_qual, static_cast(end_line - end_qual)}; + } + //!\} + + /*!\name Parsed record handling + * \brief This is mostly done via the defaults in the base class. + * \{ + */ + //!\brief Overload for parsing QNAME. + template + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) + { + std::string_view raw_field = get(raw_record); + if (raw_field != "*") + parse_field_aux(raw_field, parsed_field); // default parsing + } + + //!\brief Overload for parsing FLAG. + void parse_field(vtag_t const & tag, map_io::sam_flag & parsed_field) + { + uint16_t flag_integral{}; + parse_field(tag, flag_integral); // arithmetic parsing + parsed_field = map_io::sam_flag{flag_integral}; + } + + //!\brief Parse the RNAME field. The reference names are stored in the header. + template + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) + { + std::string_view raw_field = get(raw_record); + + if (raw_field != "*") + { + size_t rname_pos; + + if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end()) + { // rname name was not in header, insert! + header.push_back_rname(raw_field, raw_field.size(), ""); + + rname_pos = header.rnames().size() - 1; + + warning("The reference sequence name \"", raw_field, "\" is not present in the header."); + } + else + { + rname_pos = it->second; + } + + if constexpr (std::integral>) + parsed_field = rname_pos; + else + parsed_field = header.rnames()[rname_pos]; + } + } + + /* POS, MAPQ are handled correctly by default */ + + //!\brief Overload for parsing CIGAR. + void parse_field(vtag_t const & /**/, std::vector & cigar_vector) + { + std::string_view raw_field = get(raw_record); + + if (raw_field != "*") + { + uint32_t cigar_count{}; + char const * ptr = raw_field.data(); + char const * end = ptr + raw_field.size(); + + while (ptr < end) + { + auto const res = std::from_chars(ptr, end, cigar_count); // reads number up to next chatacter + + if (res.ec != std::errc{}) // parse number + error("Corrupted cigar string encountered"); + + ptr = res.ptr + 1; // skip cigar operation character + + cigar_vector.emplace_back(cigar_count, + seqan3::assign_char_strictly_to(*res.ptr, seqan3::cigar::operation{})); + } + } + } + + //!\brief Parse the RNEXT field. + template + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) + { + std::string_view raw_field = get(raw_record); + + if (raw_field != "*") + { + if (raw_field == "=") + raw_field = get(raw_record); + + size_t rname_pos; + + if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end()) + { // rname name was not in header, insert! + header.push_back_rname(raw_field, raw_field.size(), ""); + + rname_pos = header.rnames().size() - 1; + + warning("The mates reference sequence name \"", raw_field, "\" is not present in the header."); + } + else + { + rname_pos = it->second; + } + + if constexpr (std::integral>) + parsed_field = rname_pos; + else + parsed_field = header.rnames()[rname_pos]; + } + } + + /* PNEXT, TLEN are handled correctly by default */ + + //!\brief Overload for parsing SEQ. + template + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) + { + std::string_view raw_field = get(raw_record); + + if (raw_field != "*") + parse_field_aux(raw_field, parsed_field); // reading into e.g. dna4 vector + } + + //!\brief Overload for parsing QUAL. + template + void parse_field(vtag_t const & /**/, parsed_field_t & parsed_field) + { + std::string_view raw_field = get(raw_record); + + if (raw_field != "*") + parse_field_aux(raw_field, parsed_field); // reading into e.g. dna4 vector + } + + //!\brief Overload for parsing the SAM tag dictionary. + void parse_field(vtag_t const & /**/, map_io::sam_tag_dictionary & dictionary) + { + std::string_view raw_field = get(raw_record); + + if (!raw_field.empty()) + for (std::string_view const tag_field : raw_field | detail::eager_split('\t')) + dictionary.parse_and_emplace(tag_field); + } + + //!\brief Overload for parsing the private data. + void parse_field(vtag_t const & /**/, map_io::record_private_data & parsed_field) + { + parsed_field = map_io::record_private_data{.header_ptr = &header}; + } + //!\} + +public: + /*!\name Constructors, destructor and assignment. + * \{ + */ + format_input_handler() = default; //!< Defaulted. + format_input_handler(format_input_handler const &) = delete; //!< Deleted. + format_input_handler(format_input_handler &&) = default; //!< Defaulted. + ~format_input_handler() = default; //!< Defaulted. + format_input_handler & operator=(format_input_handler const &) = delete; //!< Deleted. + format_input_handler & operator=(format_input_handler &&) = default; //!< Defaulted. + + /*!\brief Construct with an options object. + * \param[in,out] str The input stream. + * \param[in] options An object with options for the input handler. + * \details + * + * The options argument is typically bio::map_io::reader_options, but any object with a subset of similarly named + * members is also accepted. See bio::format_input_handler for the supported options and defaults. + */ + template + format_input_handler(std::istream & str, options_t const & options) : base_t{str}, file_it{str, false /*no_init!*/} + { + // extract options + if constexpr (requires { (bool)options.print_warnings; }) + { + print_warnings = options.print_warnings; + } + + std::string header_string; + while (file_it != std::default_sentinel && file_it.peak() == '@') + { + ++file_it; + ++line; + header_string += file_it->line; + header_string += "\n"; + } + + header = map_io::header{}; + header.read(header_string); + } + + //!\brief Construct with only an input stream. + format_input_handler(std::istream & str) : format_input_handler{str, int{}} {} + //!\} + + //!\brief Return a reference to the header contained in the input handler. + auto const & get_header() const { return header; } +}; + +} // namespace bio diff --git a/include/bio/map_io/all.hpp b/include/bio/map_io/all.hpp new file mode 100644 index 0000000..b8c1e0a --- /dev/null +++ b/include/bio/map_io/all.hpp @@ -0,0 +1,28 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Meta-include that includes the whole Variant I/O module. + * \author Svenja Mehringer + */ + +#pragma once + +/*!\defgroup map_io Map I/O + * \ingroup bio + * \brief Reader and writer for SAM and BAM files. + * + * This module provides high-level APIs to read and write SAM and BAM files. + * + * To read files, have a look at bio::map_io::reader and to write files have a look at bio::map_io::writer. + * + */ + +/*!\namespace bio::map_io + * \brief Namespace for the Map I/O module. + */ diff --git a/include/bio/map_io/header.hpp b/include/bio/map_io/header.hpp new file mode 100644 index 0000000..cb72c90 --- /dev/null +++ b/include/bio/map_io/header.hpp @@ -0,0 +1,583 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the bio::map_io::tag_dictionary class and auxiliaries. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace bio::map_io +{ + +/*!\brief The header used in mapping I/O. + * \ingroup map_io + * \details + * + * Each header line begins with the character `@` followed by one of the two-letter header record type codes + * defined in this section. In the header, each line is tab-delimited and, apart from `@CO` lines, each data field + * follows a format `TAG:VALUE` where TAG is a two-character string that defines the format and content of + * VALUE. Thus header lines match `/^@(HD|SQ|RG|PG)(\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/` or are comment lines staring + * with `@CO` followed by a tab and any character sequence. + * Within each (non-`@CO`) header line, no field tag may appear more than once and the order in which the fields + * appear is not significant. + * + * \sa https://samtools.github.io/hts-specs/SAMv1.pdf + */ +class header +{ +public: + /*!\name Constructors, destructor and assignment + * \{ + */ + //!\brief Default construction. + header() = default; //!< Defaulted. + header(header const &) = default; //!< Defaulted. + header(header &&) = default; //!< Defaulted. + ~header() = default; //!< Defaulted. + header & operator=(header const &) = default; //!< Defaulted. + header & operator=(header &&) = default; //!< Defaulted. + + /*!\brief Construct from a range of reference ids. + * \param[in] ref_ids The range over reference ids to redirect the pointer at. + */ + template // todo: restrict value type to be std::string_view constructible + header(ref_ids_type & ref_ids) : reference_names_given_on_construction{true} + { + for (auto & id : ref_ids) + reference_names.emplace_back(id); + } + + /*!\brief Construct from a rvalue range of reference ids. Ids are copied over. + * \param[in] ref_ids The range over reference ids to own. + */ + template // todo: restrict value type to be std::string_view constructible + header(ref_ids_type && ref_ids) : reference_names_given_on_construction{true} + { + for (auto & id : ref_ids) + { + owned_reference_names.push_back(id); + reference_names.emplace_back(owned_reference_names.back()); + } + } + //!\} + + //!\brief Defaulted three-way comparison. + auto operator<=>(header const &) const = default; + + inline void read(std::string_view header_string); + +private: + //!\brief In case no reference information is given on construction, the reference names are stored here. + std::deque owned_reference_names; + + //!\brief The reference sequence names. + std::vector reference_names; + + //!\brief Additional information to the reference sequence (same ordering as `reference_names`). + std::vector> reference_names_info{}; + + //!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range. + std::unordered_map reference_name_to_pos{}; + + //!\brief Whether reference sequence names were given to the header on construction. + bool reference_names_given_on_construction{false}; + + //!\brief Print a B.I.O warning message with current line number in diagnostic. + /* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const + { + // if (print_warnings) + // { + // seqan3::debug_stream << "[B.I.O sam format header warning in line " << line << "] "; // todo line numbers + seqan3::debug_stream << "[B.I.O sam format header warning] "; + (seqan3::debug_stream << ... << messages); + seqan3::debug_stream << std::endl; + // } + } + +public: + /*!\name [HD] File-level meta data + * \brief You can directly edit these member variables. + * \{ + */ + std::string + format_version{}; //!< [HD VN] The file format version. Note: this is overwritten by our formats on output. + std::string sorting{}; //!< [HD SO] The sorting of the file. SAM: [unknown, unsorted, queryname, coordinate]. + std::string grouping{}; //!< [HD GO] The grouping of the file. SAM: [none, query, reference]. + std::string subsorting{}; //!< [HD SS] The sub-sorting of the file. SAM: [unknown, unsorted, queryname, + //!< coordinate]`(:[A-Za-z0-9_-]+)+`. + //!\} + + /*!\name [SQ] Reference sequence dictionary + * \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers. + * \{ + */ + + /*!\brief [SQ SN] Reference sequence names + * + * \details + * + * This member function gives you access to the range of reference names. + * + * When reading a file, there are three scenarios: + * 1) Reference id information is provided on construction. In this case, no copy is made but this function + * gives you a reference to the provided range. When reading the header or the records, their reference + * information will be checked against the given input. + * 2) No reference information is provided on construction but the `@SQ` tags are present in the header. + * In this case, the reference id information is extracted from the header and this member function provides + * access to them. When reading the records, their reference id information will be checked against the header + * information. + * 3) No reference information is provided on construction an no `@SQ` tags are present in the header. + * In this case, the reference information is parsed from the records field::ref_id and stored in the header. + * This member function then provides access to the unique list of reference names encountered in the records. + */ + std::vector const & rnames() { return reference_names; } + + /*!\brief [SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information + * + * \details + * + * The reference information store the length (`@LN` tag) and + * additional information of each reference sequence in the file. The record + * must then store only the index of the reference. + * The name and length information are required if the header is provided + * and each reference sequence that is referred to in any of the records + * must be present in the dictionary, otherwise a seqan3::format_error will + * be thrown upon reading or writing a file. + * + * The additional information (2nd tuple entry) must model + * the following formatting rules: The information is given in a tab separated + * `TAG:VALUE` format, where TAG must be one of [AH, AN, AS, m5, SP, UR]. + * The following information and rules apply for each tag (taken from the SAM specs): + * + * * **AH:** Indicates that this sequence is an alternate locus. The value is the locus in the primary assembly for + * which this sequence is an alternative, in the format `chr:start-end`, `chr` (if known), or `*` (if + * unknown), where `chr` is a sequence in the primary assembly. Must not be present on sequences in the + * primary assembly. + * * **AN:** Alternative reference sequence names. A comma-separated list of alternative names that tools may use + * when referring to this reference sequence. These alternative names are not used elsewhere within the + * SAM file; in particular, they must not appear in alignment records’ RNAME or RNEXT fields. regular + * expression : `name (, name )*` where name is `[0-9A-Za-z][0-9A-Za-z*+.@ \|-]*`. + * * **AS:** Genome assembly identifier. + * * **M5:** MD5 checksum of the sequence. See Section 1.3.1 + * * **SP:** Species. + * * **UR:** URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:. If + * it does not start with one of these protocols, it is assumed to be a file-system path + */ + std::vector> const & rnames_info() { return reference_names_info; } + + //!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range. + std::unordered_map const & rname_to_pos() { return reference_name_to_pos; } + + //!\brief Append a new rname entry with it's length and extra information and update reference_name_to_pos. + void push_back_rname(std::string_view const rname, int32_t const length, std::string_view const extra_info) + { + owned_reference_names.emplace_back(rname); + reference_names.emplace_back(owned_reference_names.back()); + reference_names_info.emplace_back(length, extra_info); + reference_name_to_pos[reference_names.back()] = reference_names.size() - 1; + } + //!\} + + /*!\name [RG] Read groups + * \brief You can directly edit these member variables. + * \{ + */ + /*!\brief The Read Group List + * + * \details + * + * The read group list stores the group id and + * additional information of each read group in the file. The record + * may store a RG tag information referencing one of the stored id's. + * The id information is required if the \@RG header line is provided. + * + * The additional information (2nd tuple entry) for the SAM format must follow + * the following formatting rules: The information is given in a tab separated + * TAG:VALUE format, where TAG must be one of [AH, AN, AS, m5, SP, UR]. + * The following information and rules apply for each tag (taken from the SAM specs): + * + * * **BC:** Barcode sequence identifying the sample or library. This value is the expected barcode bases as read by + * the sequencing machine in the absence of errors. If there are several barcodes for the sample/library + * (e.g., one on each end of the template), the recommended implementation concatenates all the barcodes + * separating them with hyphens (`-`). + * * **CN:** Name of sequencing center producing the read. + * * **DS:** Description. UTF-8 encoding may be used. + * * **DT:** Date the run was produced (ISO8601 date or date/time). + * * **FO:** Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each + * read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other + * characters. Format : `/\*\|[ACMGRSVTWYHKDBN]+/` + * * **KS:** The array of nucleotide bases that correspond to the key sequence of each read. + * * **LB:** Library. + * * **PG:** Programs used for processing the read group. + * * **PI:** Predicted median insert size. + * * **PL:** Platform/technology used to produce the reads. Valid values : CAPILLARY, LS454, ILLUMINA, SOLID, + * HELICOS, IONTORRENT, ONT, and PACBIO. + * * **PM:** Platform model. Free-form text providing further details of the platform/technology used. + * * **PU:** Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier. + * * **SM:** Sample. Use pool name where a pool is being sequenced. + */ + std::vector> read_groups{}; + //!\} + + /*!\name [PG] Programm information + * \brief You can directly edit these member variables. + * \{ + */ + //!\brief Stores information of the program/tool that was used to create the file. + struct program_info_t + { + std::string id{}; //!< A unique (file scope) id. + std::string name{}; //!< The official name. + std::string command_line_call{}; //!< The command line call that produces the file. + std::string previous{}; //!< The id of the previous program if program calls were chained. + std::string description{}; //!< A description of the program and/or program call. + std::string version{}; //!< The program/tool version. + }; + + std::vector program_infos{}; //!< The list of program information. + //!\} + + /*!\name [CO] Comments + * \brief You can directly edit these member variables. + * \{ + */ + std::vector comments{}; //!< The list of comments. + //!\} +}; + +/*!\brief Reads the SAM header. + * \param[in] header_string The full header as a std::string_view. + * + * \throws bio::map_io::format_error if any unexpected character or format is encountered. + * + * \details + * + * Reading the header format is done according to the official + * [SAM format specifications](https://samtools.github.io/hts-specs/SAMv1.pdf). + * + * The function throws a bio::map_io::format_error if any unknown tag was encountered. It will also fail if the format + * is not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual + * error. + */ +void header::read(std::string_view header_string) +{ + auto stream_view = header_string | seqan3::views::single_pass_input; + + auto it = std::ranges::begin(stream_view); + auto end = std::ranges::end(stream_view); + std::array raw_tag{}; + + auto make_tag = [](uint8_t char1, uint8_t char2) constexpr + { + return static_cast(char1) | (static_cast(char2) << CHAR_BIT); + }; + + auto parse_and_make_tag = [&]() + { + raw_tag[0] = *it; + ++it; + raw_tag[1] = *it; + ++it; + return make_tag(raw_tag[0], raw_tag[1]); + }; + + auto skip_until = [&it](auto const & predicate) + { + while (!predicate(*it)) + ++it; + }; + + auto copy_tag_value_into = [&](auto & target) + { + skip_until(seqan3::is_char<':'>); + ++it; // skip : + while (!(seqan3::is_char<'\t'> || seqan3::is_char<'\n'>)(*it)) + { + target.push_back(*it); + ++it; + } + }; + + auto copy_tag_into = [&raw_tag, ©_tag_value_into](std::string & target) + { + // Some tags are not parsed individually. Instead, these are simply copied into a std::string. + // Multiple tags must be separated by a `\t`, hence we prepend a tab to the string, except the first time. + if (!target.empty()) + target.push_back('\t'); + target.push_back(raw_tag[0]); + target.push_back(raw_tag[1]); + target.push_back(':'); + copy_tag_value_into(target); + }; + + auto print_cerr_of_unspported_tag = [&it, &raw_tag](char const * const header_tag) + { std::cerr << "Unsupported SAM header tag in @" << header_tag << ": " << raw_tag[0] << raw_tag[1] << '\n'; }; + + while (it != end && seqan3::is_char<'@'>(*it)) + { + ++it; // skip @ + + switch (parse_and_make_tag()) + { + case make_tag('H', 'D'): // HD (header) tag + { + // All tags can appear in any order, VN is the only required tag + while (seqan3::is_char<'\t'>(*it)) + { + ++it; // skip tab + + switch (parse_and_make_tag()) + { + case make_tag('V', 'N'): // parse required VN (version) tag + { + copy_tag_value_into(this->format_version); + break; + } + case make_tag('S', 'O'): // SO (sorting) tag + { + copy_tag_value_into(this->sorting); + break; + } + case make_tag('S', 'S'): // SS (sub-order) tag + { + copy_tag_value_into(this->subsorting); + break; + } + case make_tag('G', 'O'): // GO (grouping) tag + { + copy_tag_value_into(this->grouping); + break; + } + default: // unsupported header tag + { + print_cerr_of_unspported_tag("HD"); + skip_until(seqan3::is_char<'\t'> || seqan3::is_char<'\n'>); + } + } + } + ++it; // skip newline + + if (format_version.empty()) + throw format_error{"The required VN tag in @HD is missing."}; + + break; + } + + case make_tag('S', 'Q'): // SQ (sequence dictionary) tag + { + std::string id; + std::tuple info{}; + bool sequence_length_was_present{}; + + // All tags can appear in any order, SN and LN are required tags + while (seqan3::is_char<'\t'>(*it)) + { + ++it; // skip tab + + switch (parse_and_make_tag()) + { + case make_tag('S', 'N'): // parse required SN (sequence name) tag + { + copy_tag_value_into(id); + break; + } + case make_tag('L', 'N'): // parse required LN (length) tag + { + sequence_length_was_present = true; + ++it; // skip : + auto res = std::from_chars(&(*it), &(header_string.back()), get<0>(info)); + if (res.ec != std::errc{}) + throw format_error{"LN tag could not be parsed correctly."}; + skip_until(seqan3::is_char<'\t'> || seqan3::is_char<'\n'>); + break; + } + default: // Any other tag + { + copy_tag_into(std::get<1>(info)); + } + } + } + ++it; // skip newline + + if (id.empty()) + throw format_error{"The required SN tag in @SQ is missing."}; + if (!sequence_length_was_present) + throw format_error{"The required LN tag in @SQ is missing."}; + + if (!reference_names_given_on_construction) + { // Reference information was not given by the user but needs to be filled from the header + push_back_rname(id, std::get<0>(info), std::get<1>(info)); + } + else + { // If reference information was given we check them against the once in the header + auto id_it = reference_name_to_pos.find(id); + + if (id_it == reference_name_to_pos.end()) + { + warning("The reference sequence name \"", + id, + "\" was present in the header but not in the " + "user provided rnames."); + + push_back_rname(id, std::get<0>(info), std::get<1>(info)); + } + else + { + assert(id_it->second < static_castsecond)>(reference_names_info.size())); + + if (std::get<0>(reference_names_info[id_it->second]) != std::get<0>(info)) + warning("Provided and header-based reference length differ for rname :\"", id, "\"."); + + reference_names_info[id_it->second] = info; // update rname information + } + } + break; + } + + case make_tag('R', 'G'): // RG (read group) tag + { + std::pair tmp{}; + + // All tags can appear in any order, SN and LN are required tags + while (seqan3::is_char<'\t'>(*it)) + { + ++it; // skip tab + + switch (parse_and_make_tag()) + { + case make_tag('I', 'D'): // parse required ID tag + { + copy_tag_value_into(get<0>(tmp)); + break; + } + default: // Any other tag + { + copy_tag_into(get<1>(tmp)); + } + } + } + ++it; // skip newline + + if (get<0>(tmp).empty()) + throw format_error{"The required ID tag in @RG is missing."}; + + read_groups.emplace_back(std::move(tmp)); + break; + } + + case make_tag('P', 'G'): // PG (program) tag + { + program_info_t tmp{}; + + // All tags can appear in any order, ID is the only required tag + while (seqan3::is_char<'\t'>(*it)) + { + ++it; // skip tab + + switch (parse_and_make_tag()) + { + case make_tag('I', 'D'): // read required ID tag + { + copy_tag_value_into(tmp.id); + break; + } + case make_tag('P', 'N'): // PN (program name) tag + { + copy_tag_value_into(tmp.name); + break; + } + case make_tag('P', 'P'): // PP (previous program) tag + { + copy_tag_value_into(tmp.previous); + break; + } + case make_tag('C', 'L'): // CL (command line) tag + { + copy_tag_value_into(tmp.command_line_call); + break; + } + case make_tag('D', 'S'): // DS (description) tag + { + copy_tag_value_into(tmp.description); + break; + } + case make_tag('V', 'N'): // VN (version) tag + { + copy_tag_value_into(tmp.version); + break; + } + default: // unsupported header tag + { + print_cerr_of_unspported_tag("PG"); + skip_until(seqan3::is_char<'\t'> || seqan3::is_char<'\n'>); + } + } + } + ++it; // skip newline + + if (tmp.id.empty()) + throw format_error{"The required ID tag in @PG is missing."}; + + this->program_infos.push_back(std::move(tmp)); + break; + } + + case make_tag('C', 'O'): // CO (comment) tag + { + ++it; // skip tab + std::string tmp{}; + while (!seqan3::is_char<'\n'>(*it)) + { + tmp.push_back(*it); + ++it; + } + ++it; // skip newline + comments.emplace_back(std::move(tmp)); + break; + } + + default: + throw format_error{std::string{"Illegal SAM header tag starting with:"} + *it}; + } + } +} + +//!\brief A datastructure that contains private data of sam IO records. +//!\ingroup var_io +struct record_private_data +{ + //!\privatesection + //!\brief Pointer to the header + header const * header_ptr = nullptr; + + friend bool operator==(record_private_data const &, record_private_data const &) = default; //!< Defaulted. +}; + +} // namespace bio::map_io diff --git a/include/bio/map_io/misc.hpp b/include/bio/map_io/misc.hpp new file mode 100644 index 0000000..07bba8b --- /dev/null +++ b/include/bio/map_io/misc.hpp @@ -0,0 +1,39 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides miscellaneous content for sequence IO. + * \author Hannes Hauswedell + */ + +#pragma once + +#include + +#include + +namespace bio::map_io +{ + +//!\brief Default fields for seqan3::map_io::reader_options. +//!\ingroup map_io +inline constexpr auto default_field_ids = vtag; + +} // namespace bio::map_io diff --git a/include/bio/map_io/reader.hpp b/include/bio/map_io/reader.hpp new file mode 100644 index 0000000..f9660af --- /dev/null +++ b/include/bio/map_io/reader.hpp @@ -0,0 +1,109 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::map_io::reader. + * \author Svenja Mehringer + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace bio::map_io +{ + +// ---------------------------------------------------------------------------- +// reader +// ---------------------------------------------------------------------------- + +/*!\brief A class for reading sam files, SAM, BAM. + * \tparam options_t A specialisation of bio::map_io::reader_options. + * \ingroup map_io + * + * \details + * + * TODO + * + * For more advanced options, see bio::map_io::reader_options. + */ +template +class reader : public reader_base> +{ +private: + //!\brief The base class. + using base_t = reader_base>; + //!\brief Inherit the format_type definition. + using format_type = typename base_t::format_type; + /* Implementation note + * format_type is "inherited" as private here to avoid appearing twice in the documentation. + * Its actual visibility is public because it is public in the base class. + */ + //!\brief Make the format handler visible. + using base_t::format_handler; + + //!\brief A pointer to the header inside the format. + bio::map_io::header const * header_ptr = nullptr; + +public: + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::filesystem::path const & filename, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::filesystem::path const & filename, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{filename, fmt, opt} + {} + + //!\overload + explicit reader(std::filesystem::path const & filename, + reader_options const & opt = reader_options{}) : + base_t{filename, opt} + {} + + // clang-format off + //!\copydoc bio::reader_base::reader_base(std::istream & str, format_type const & fmt, options_t const & opt = options_t{}) + // clang-format on + reader(std::istream & str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{str, fmt, opt} + {} + + //!\overload + template + //!\cond REQ + requires(!std::is_lvalue_reference_v) + //!\endcond + reader(temporary_stream_t && str, + format_type const & fmt, + reader_options const & opt = reader_options{}) : + base_t{std::move(str), fmt, opt} + {} + + //!\brief Access the header. + bio::map_io::header const & header() + { + if (header_ptr == nullptr) + { + // ensure that the format_handler is created + this->begin(); + + header_ptr = std::visit([](auto const & handler) { return &handler.get_header(); }, format_handler); + } + + return *header_ptr; + } +}; + +} // namespace bio::map_io diff --git a/include/bio/map_io/reader_options.hpp b/include/bio/map_io/reader_options.hpp new file mode 100644 index 0000000..bd1bfad --- /dev/null +++ b/include/bio/map_io/reader_options.hpp @@ -0,0 +1,119 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides bio::map_io::reader_options and various pre-defined field_types. + * \author Hannes Hauswedell + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace bio::map_io +{ + +/*!\name Pre-defined field types + * \brief These can be used to configure the behaviour of the bio::map_io::reader va bio::map_io::reader_options. + * \{ + */ + +/*!\brief The default field types for variant io. + *!\ingroup map_io + * + * \details + * + * These traits define a record type with minimal memory allocations for all input formats. + * It is the recommended record type when iterating ("streaming") over files that ca be any variant IO format. + * + * The "style" of the record resembles the BCF specification, i.e. contigs, FILTERs and INFO identifiers are + * represented as numbers (not strings); and the genotypes are encoded by-genotype (not by-sample). + * See bio::map_io::genotypes_bcf_style for more information on the latter. + * + * \warning Shallow types + * + * These records are not self-contained, i.e. they depend on caches and will become invalid when the reader moves to + * the next record. + * Since some elements in the record are views, it may not be possible and/or safe to change all values. + */ +template +inline constexpr auto field_types_sam = + ttag, // field::cigar, + std::string_view, // field::rnext, + int32_t, // field::pnext, + int32_t, // field::tlen, + decltype(std::string_view{} | seqan3::views::char_to), // field::seq + decltype(std::string_view{} | seqan3::views::char_to), // field::qual + sam_tag_dictionary, // field::tags + record_private_data>; // field::_private +//!\} + +/*!\brief Options that can be used to configure the behaviour of bio::map_io::reader. + * \tparam field_ids_t Type of the field_ids member (usually deduced). + * \tparam field_types_t Type of the field_types member (usually deduced). + * \tparam formats_t Type of the formats member (usually deduced). + * \ingroup map_io + * + * \details + * + * TODO describe how to easily initialise this + */ +template , + typename field_types_t = std::remove_cvref_t)>, + typename formats_t = seqan3::type_list> +struct reader_options +{ + //!\brief The fields that shall be contained in each record; a seqan3::tag over seqan3::field. + field_ids_t field_ids{}; + + /*!\brief The types corresponding to each field; a seqan3::type_tag over the types. + * + * \details + * + * See bio::map_io::reader for an overview of the supported field/type combinations. + */ + field_types_t field_types{}; + + /*!\brief The formats that input files can take; a seqan3::type_tag over the types. + * + * \details + * + * See bio::map_io::reader for an overview of the the supported formats. + */ + formats_t formats{}; + + //!\brief Whether to print non-critical file format warnings. + bool print_warnings = true; + + //!\brief Options that are passed on to the internal stream oject. + transparent_istream_options stream_options{}; + + // TODO static_assert +}; + +} // namespace bio::map_io diff --git a/include/bio/map_io/sam_flag.hpp b/include/bio/map_io/sam_flag.hpp new file mode 100644 index 0000000..c86d7f9 --- /dev/null +++ b/include/bio/map_io/sam_flag.hpp @@ -0,0 +1,107 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides helper data structures for the seqan3::sam_file_output. + * \author Svenja Mehringer + */ + +#pragma once + +#include +#include + +namespace bio::map_io +{ + +/*!\brief An enum flag that describes the properties of an aligned read (given as a SAM record). + * \ingroup io_sam_file + * \implements seqan3::enum_bitwise_operators + * \sa seqan3::enum_bitwise_operators enables combining enum values. + * + * The SAM flag are bitwise flags, which means that each value corresponds to a specific bit that is set and that they + * can be combined and tested using binary operations. + * See this [tutorial](https://www.codeproject.com/Articles/13740/The-Beginner-s-Guide-to-Using-Enum-Flags) for an + * introduction on bitwise operations on enum flags. + * + * Example: + * + * \include test/snippet/io/sam_file/sam_flags.cpp + * + * Adapted from the [SAM specifications](https://samtools.github.io/hts-specs/SAMv1.pdf) are the following additional + * information to some flag values: + * * For each read/contig in a SAM file, it is required that one and only one line associated with the read + * has neither the bio::map_io::sam_flag::secondary_alignment nor the bio::map_io::sam_flag::supplementary_alignment + * flag value set (satisfies `FLAG & 0x900 == 0 `). This line is called the **primary alignment** of the read. + * * bio::map_io::sam_flag::secondary_alignment (bit `0x100`) marks the alignment not to be used in certain analyses + * when the tools in use are aware of this bit. It is typically used to flag alternative mappings when multiple + * mappings are presented in a SAM. + * * bio::map_io::sam_flag::supplementary_alignment (bit `0x800`) indicates that the corresponding alignment line is + * part of a chimeric alignment. If the SAM/BAM file corresponds to long reads (nanopore/pacbio) this happens when reads + * are split before being aligned and the best matching part is marked as primary, while all other aligned parts are + * marked supplementary. + * * bio::map_io::sam_flag::unmapped (bit `0x4`) is the only reliable place to tell whether the read is unmapped. + * If bio::map_io::sam_flag::unmapped is set, no assumptions can be made about RNAME, POS, CIGAR, MAPQ, and + * bio::map_io::sam_flag::proper_pair, bio::map_io::sam_flag::secondary_alignment, and + * bio::map_io::sam_flag::supplementary_alignment (bits `0x2`, `0x100`, and `0x800`). + * * bio::map_io::sam_flag::on_reverse_strand (bit `0x10`) indicates whether the read sequence has been reverse + * complemented and the quality string is reversed. When bit bio::map_io::sam_flag::unmapped (`0x4`) is unset, this + * corresponds to the strand to which the segment has been mapped: bio::map_io::sam_flag::on_reverse_strand (bit + * `0x10`) unset indicates the forward strand, while set indicates the reverse strand. When + * bio::map_io::sam_flag::unmapped (`0x4`) is set, this indicates whether the unmapped read is stored in its original + * orientation as it came off the sequencing machine. + * * bio::map_io::sam_flag::first_in_pair and bio::map_io::sam_flag::second_in_pair (bits `0x40` and `0x80`) reflect the + * read ordering within each template inherent in the sequencing technology used. If + * bio::map_io::sam_flag::first_in_pair and bio::map_io::sam_flag::second_in_pair (`0x40` and `0x80`) are both set, the + * read is part of a linear template, but it is neither the first nor the last read. If both are unset, the index of the + * read in the template is unknown. This may happen for a non-linear template or when this information is lost during + * data processing. + * * If bio::map_io::sam_flag::paired (bit `0x1`) is unset, no assumptions can be made about + * bio::map_io::sam_flag::proper_pair, bio::map_io::sam_flag::mate_unmapped, + * bio::map_io::sam_flag::mate_on_reverse_strand, bio::map_io::sam_flag::first_in_pair and + * bio::map_io::sam_flag::second_in_pair (bits `0x2`, `0x8`, `0x20`, `0x40` and `0x80`). + * + * \sa https://broadinstitute.github.io/picard/explain-flags.html + */ +enum class sam_flag : uint16_t +{ + none = 0, //!< None of the flags below are set. + paired = 0x1, //!< The aligned read is paired (paired-end sequencing). + proper_pair = 0x2, //!< The two aligned reads in a pair have a proper distance between each other. + unmapped = 0x4, //!< The read is not mapped to a reference (unaligned). + mate_unmapped = 0x8, //!< The mate of this read is not mapped to a reference (unaligned). + on_reverse_strand = 0x10, //!< The read sequence has been reverse complemented before being mapped (aligned). + mate_on_reverse_strand = 0x20, //!< The mate sequence has been reverse complemented before being mapped (aligned). + first_in_pair = 0x40, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description). + second_in_pair = 0x80, //!< Indicates the ordering (see details in the bio::map_io::sam_flag description). + secondary_alignment = 0x100, //!< This read alignment is an alternative (possibly suboptimal) to the primary. + failed_filter = 0x200, //!< The read alignment failed a filter, e.g. quality controls. + duplicate = 0x400, //!< The read is marked as a PCR duplicate or optical duplicate. + supplementary_alignment = 0x800 //!< This sequence is part of a split alignment and is not the primary alignment. +}; + +//!\cond DEV +/*!\brief Enables bitwise operations for bio::map_io::sam_flags. + * \ingroup io_sam_file + * \sa seqan3::enum_bitwise_operators enables combining enum values. + */ +template <> +constexpr bool add_enum_bitwise_operators = true; +//!\endcond + +/*!\brief seqan3::debug_stream overload for the bio::map_io::::sam_flags. + * \tparam char_t Type char type of the debug_stream. + * \param stream The seqan3::debug_stream. + * \param flag The flag to print. + */ +template +inline seqan3::debug_stream_type & operator<<(seqan3::debug_stream_type & stream, sam_flag const flag) +{ + return stream << static_cast(flag); +} + +} // namespace bio::map_io diff --git a/include/bio/map_io/sam_tag_dictionary.hpp b/include/bio/map_io/sam_tag_dictionary.hpp new file mode 100644 index 0000000..0f1237e --- /dev/null +++ b/include/bio/map_io/sam_tag_dictionary.hpp @@ -0,0 +1,746 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +/*!\file + * \brief Provides the bio::map_io::sam_tag_dictionary class and auxiliaries. + * \author Svenja Mehringer + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace bio::detail +{ +//!\brief std::variant of allowed types for optional tag fields of the SAM format. +//!\ingroup io_sam_file +using sam_tag_variant = std::variant, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector>; + +//!\brief Each SAM tag type char identifier. Index corresponds to the bio::detail::sam_tag_variant types. +//!\ingroup io_sam_file +char constexpr sam_tag_type_char[12] = {'A', 'i', 'f', 'Z', 'H', 'B', 'B', 'B', 'B', 'B', 'B', 'B'}; +//!\brief Each types SAM tag type extra char id. Index corresponds to the bio::detail::sam_tag_variant types. +//!\ingroup io_sam_file +char constexpr sam_tag_type_char_extra[12] = {'\0', '\0', '\0', '\0', '\0', 'c', 'C', 's', 'S', 'i', 'I', 'f'}; +} // namespace bio::detail + +namespace bio +{ + +inline namespace literals +{ + +/*!\name Other literals + * \{ + */ +/*!\brief The SAM tag literal, such that tags can be used in constant expressions. + * \ingroup io_sam_file + * \tparam char_t The char type. Usually `char`. Parameter pack `...s` must be of length 2 since SAM tags consist of two + * letters (char0 and char1). + * \relatesalso bio::map_io::sam_tag_dictionary + * \returns The unique identifier of the SAM tag computed by char0 * 128 + char1. + * + * \details + * + * A SAM tag consists of two letters, initialized via the string literal ""_tag, which delegate to its unique id. + * + * \snippet test/snippet/io/sam_file/sam_tag_dictionary/sam_tag_dictionary.cpp tag + * + * The purpose of those tags is to fill or query the bio::map_io::sam_tag_dictionary for a specific key (tag_id) and + * retrieve the corresponding value. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +template +constexpr uint16_t operator""_tag() +{ + static_assert(std::same_as, "Illegal SAM tag: Type must be char."); + constexpr std::array str{s...}; +#pragma GCC diagnostic pop + + static_assert(str.size() == 2, "Illegal SAM tag: Exactly two characters must be given."); + + char constexpr char0 = str[0]; + char constexpr char1 = str[1]; + + static_assert((seqan3::is_alpha(char0) && seqan3::is_alnum(char1)), + "Illegal SAM tag: a SAM tag must match /[A-Za-z][A-Za-z0-9]/."); + + return static_cast(char0) * 256 + static_cast(char1); +} +//!\} + +} // namespace literals + +namespace map_io +{ + +/*!\brief The generic base class. + * \ingroup io_sam_file + * + * \attention This is a pure base class that needs to be specialized in order to + * be used. + * + * ### How to specialize the type for your custom tag + * + * All known tags of the SAM specifications already have a pre-defined type. + * If you want to specify the type of your custom tag (the SAM specifications + * recommend to use X?, Y? or Z?) you need to overload the bio::map_io::sam_tag_type + * struct in the following way: (take tag "XX" as an example) + * + * \snippet test/snippet/io/sam_file/sam_tag_dictionary/sam_tag_dictionary.cpp type_overload + * + * Everything else, like the get and set functions and correct SAM output + * (XX:i:? in this case) is handled by the bio::map_io::sam_tag_dictionary. + * + * The bio::map_io::sam_tag_type is overloaded the following SAM tags: + * + * | Tag Name | SeqAn Type Implementation | + * | -------- | --------------------- | + * | "AM"_tag | int32_t | + * | "AS"_tag | int32_t | + * | "BC"_tag | std::string | + * | "BQ"_tag | std::string | + * | "BZ"_tag | std::string | + * | "CB"_tag | std::string | + * | "CC"_tag | std::string | + * | "CG"_tag | std::vector | + * | "CM"_tag | int32_t | + * | "CO"_tag | std::string | + * | "CP"_tag | int32_t | + * | "CQ"_tag | std::string | + * | "CR"_tag | std::string | + * | "CS"_tag | std::string | + * | "CT"_tag | std::string | + * | "CY"_tag | std::string | + * | "E2"_tag | std::string | + * | "FI"_tag | int32_t | + * | "FS"_tag | std::string | + * | "FZ"_tag | std::vector | + * | "H0"_tag | int32_t | + * | "H1"_tag | int32_t | + * | "H2"_tag | int32_t | + * | "HI"_tag | int32_t | + * | "IH"_tag | int32_t | + * | "LB"_tag | std::string | + * | "MC"_tag | std::string | + * | "MD"_tag | std::string | + * | "MI"_tag | std::string | + * | "MQ"_tag | int32_t | + * | "NH"_tag | int32_t | + * | "NM"_tag | int32_t | + * | "OC"_tag | std::string | + * | "OP"_tag | int32_t | + * | "OQ"_tag | std::string | + * | "OX"_tag | std::string | + * | "PG"_tag | std::string | + * | "PQ"_tag | int32_t | + * | "PT"_tag | std::string | + * | "PU"_tag | std::string | + * | "Q2"_tag | std::string | + * | "QT"_tag | std::string | + * | "QX"_tag | std::string | + * | "R2"_tag | std::string | + * | "RG"_tag | std::string | + * | "RT"_tag | std::string | + * | "RX"_tag | std::string | + * | "SA"_tag | std::string | + * | "SM"_tag | int32_t | + * | "TC"_tag | int32_t | + * | "U2"_tag | std::string | + * | "UQ"_tag | int32_t | + * + */ +template +struct sam_tag_type +{ + //!\brief The type for all unknown tags with no extra overload defaults to a std::variant. + using type = detail::sam_tag_variant; +}; + +//!\brief Short cut helper for bio::map_io::sam_tag_type::type. +//!\relates bio::map_io::sam_tag_type +template +using sam_tag_type_t = typename sam_tag_type::type; + +//!\cond +template <> +struct sam_tag_type<"AM"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"AS"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"BC"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"BQ"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"BZ"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CB"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CC"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CG"_tag> +{ + using type = std::vector; +}; +template <> +struct sam_tag_type<"CM"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"CO"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CP"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"CQ"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CR"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CS"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CT"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"CY"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"E2"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"FI"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"FS"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"FZ"_tag> +{ + using type = std::vector; +}; + +// template <> struct sam_tag_type<"GC"_tag> {}; +// template <> struct sam_tag_type<"GQ"_tag> {}; +// template <> struct sam_tag_type<"GS"_tag> {}; + +template <> +struct sam_tag_type<"H0"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"H1"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"H2"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"HI"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"IH"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"LB"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"MC"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"MD"_tag> +{ + using type = std::string; +}; + +// template <> struct sam_tag_type<"MF"_tag> {}; + +template <> +struct sam_tag_type<"MI"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"MQ"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"NH"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"NM"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"OC"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"OP"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"OQ"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"OX"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"PG"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"PQ"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"PT"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"PU"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"Q2"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"QT"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"QX"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"R2"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"RG"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"RT"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"RX"_tag> +{ + using type = std::string; +}; + +// template <> struct sam_tag_type<"S2"_tag> {}; + +template <> +struct sam_tag_type<"SA"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"SM"_tag> +{ + using type = int32_t; +}; + +// template <> struct sam_tag_type<"SQ"_tag> {}; + +template <> +struct sam_tag_type<"TC"_tag> +{ + using type = int32_t; +}; +template <> +struct sam_tag_type<"U2"_tag> +{ + using type = std::string; +}; +template <> +struct sam_tag_type<"UQ"_tag> +{ + using type = int32_t; +}; +//!\endcond + +/*!\brief The SAM tag dictionary class that stores all optional SAM fields. + * \ingroup io_sam_file + * + * \details + * + * ### SAM tags + * + * A SAM tag consists of two letters, initialized via the string literal ""_tag, + * which delegates to its unique id (type uint16_t). + * Example: + * + * \snippet test/snippet/io/sam_file/sam_tag_dictionary/sam_tag_dictionary.cpp tag + * + * The purpose of those tags is to fill or query the bio::map_io::sam_tag_dictionary + * for a specific key (tag_id) and retrieve the corresponding value. + * + * ### SAM tag types + * + * Note that a SAM tag is always associated with a specific type. + * In the SAM format, the type is indicated in the second argument of the + * TAG:TYPE:VALUE field. For example "NM:i:3" specifies the NM tag of an integer + * type with value 3. + * In B.I.O, the types for + * [known](https://samtools.github.io/hts-specs/SAMtags.pdf) SAM tags + * are pre-defined by a type trait called bio::map_io::sam_tag_type. You can access + * the type via: + * + * \snippet test/snippet/io/sam_file/sam_tag_dictionary/sam_tag_dictionary.cpp tag_type_t + * + * which is the short cut for: + * + * \snippet test/snippet/io/sam_file/sam_tag_dictionary/sam_tag_dictionary.cpp tag_type + * + * The following types are allowed by the + * [SAM specifications](https://samtools.github.io/hts-specs/SAMtags.pdf): + * + * |Type | Regexp matching VALUE | Description | SeqAn Type | + * |-----|----------------------------------------|-----------------------------------------|----------------------| + * | A | [!-~] | Printable character | char | + * | i | [-+]?[0-9]+ | Signed integer | int32_t | + * | f | [-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)? | Single-precision floating number | float | + * | Z | [ !-~]* | Printable string, including space | std::string | + * | H | ([0-9A-F][0-9A-F])* | Byte array in the Hex format | std::vector | + * | B | [cCsSiIf]\(,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+ | Integer or numeric array | std::vector | + * + * For an integer or numeric array (type ‘B’), the second letter can be + * one of ‘cCsSiIf’, corresponding to type **T** = int8_t, uint8_t, int16_t, + * uint16_t, int32_t, uint32_t and float, respectively. + * + * ### Using the sam_tag_dictionary + * + * The dictionary can be accessed via the functions bio::map_io::sam_tag_dictionary::get() and + * bio::map_io::sam_tag_dictionary::set(). Every time the SAM tag you wish to query + * for must be given as a template argument to the functions. + * + * Example: + * + * \include test/snippet/io/sam_file/sam_tag_dictionary/general_usage.cpp + * + * \attention You can get any SAM_tag out of the dictionary, even if the tag is + * user defined, but note that for unknown tags the return type is an + * [std::variant](https://en.cppreference.com/w/cpp/utility/variant). + * If you want specify the return type of your custom tag, you need + * to overload the bio::map_io::sam_tag_type type trait. + * + * Unknown Tag Example: + * + * \include test/snippet/io/sam_file/sam_tag_dictionary/unknown_tag.cpp + * + * As mentioned before you can either overload the type trait bio::map_io::sam_tag_type + * for the tag "XZ" or learn more about a std::variant at + * https://en.cppreference.com/w/cpp/utility/variant. + * + * \sa bio::map_io::sam_tag_type + * \sa https://en.cppreference.com/w/cpp/utility/variant + * \sa https://samtools.github.io/hts-specs/SAMv1.pdf + * \sa https://samtools.github.io/hts-specs/SAMtags.pdf + */ +class sam_tag_dictionary : public std::map +{ +private: + //!\brief The base type. + using base_type = std::map; + +public: + //!\brief The variant type defining all valid SAM tag field types. + using variant_type = detail::sam_tag_variant; + + /*!\brief Adds an entry based on the provided string. + * + * Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter + * name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f] + * describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated + * VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf]. + * + */ + void parse_and_emplace(std::string_view const input) + { + // "[TAG]:[TYPE_ID]:[VALUE]" + assert(input.size() > 5); + uint16_t tag = static_cast(input[0]) << 8; + tag += static_cast(input[1]); + assert(input[2] == ':'); + char type_id = input[3]; + assert(input[4] == ':'); + std::string_view tag_value{input.data() + 5, input.size() - 5}; + + auto parse_arithmetic_container = [&tag_value](auto arithmetic_value) + { + using value_type = std::remove_cvref_t; + std::vector tmp_vector; + for (std::string_view const str : tag_value | detail::eager_split(',')) + { + detail::string_to_number(str, arithmetic_value); + tmp_vector.push_back(arithmetic_value); + } + return tmp_vector; + }; + + auto parse_byte_vector = [&tag_value]() + { + std::vector tmp_vector; + + if (tag_value.size() % 2 != 0) + throw format_error{"Hexadecimal tag has an uneven number of digits!"}; + + for (char const * ptr = tag_value.data(); ptr < tag_value.data() + tag_value.size(); ptr += 2) + { + uint8_t tmp_byte{}; + // std::from_chars cannot directly parse into a std::byte + std::from_chars_result res = std::from_chars(ptr, ptr + 2, tmp_byte, 16); + + if (res.ec == std::errc::invalid_argument || res.ptr != ptr + 2) + throw format_error{std::string("'") + tag_value.data() + "' could not be cast into uint8_t."}; + + if (res.ec == std::errc::result_out_of_range) + throw format_error{std::string("Casting '") + tag_value.data() + "' into uint8_t would overflow."}; + + tmp_vector.push_back(std::byte{tmp_byte}); + } + + return tmp_vector; + }; + + switch (type_id) + { + case 'A': // char + { + assert(tag_value.size() == 1); + (*this)[tag] = static_cast(tag_value[0]); + break; + } + case 'i': // int32_t + { + int32_t tmp; + detail::string_to_number(tag_value, tmp); + (*this)[tag] = tmp; + break; + } + case 'f': // float + { + float tmp; + detail::string_to_number(tag_value, tmp); + (*this)[tag] = tmp; + break; + } + case 'Z': // string + { + (*this)[tag] = std::string(tag_value.begin(), tag_value.end()); + break; + } + case 'H': + { + (*this)[tag] = parse_byte_vector(); + break; + } + case 'B': // Array. Value type depends on second char [cCsSiIf] + { + assert(input.size() > 7); + char array_value_type_id = input[5]; + tag_value = std::string_view{input.data() + 7, input.size() - 7}; + + switch (array_value_type_id) + { + case 'c': // int8_t + (*this)[tag] = parse_arithmetic_container(int8_t{}); + break; + case 'C': // uint8_t + (*this)[tag] = parse_arithmetic_container(uint8_t{}); + break; + case 's': // int16_t + (*this)[tag] = parse_arithmetic_container(int16_t{}); + break; + case 'S': // uint16_t + (*this)[tag] = parse_arithmetic_container(uint16_t{}); + break; + case 'i': // int32_t + (*this)[tag] = parse_arithmetic_container(int32_t{}); + break; + case 'I': // uint32_t + (*this)[tag] = parse_arithmetic_container(uint32_t{}); + break; + case 'f': // float + (*this)[tag] = parse_arithmetic_container(float{}); + break; + default: + throw format_error{std::string("The first character in the numerical ") + + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id + + "' was given."}; + } + break; + } + default: + throw format_error{std::string("The second character in the numerical id of a " + "SAM tag must be one of [A,i,Z,H,B,f] but '") + + type_id + "' was given."}; + } + } + + /*!\name Getter function for the bio::map_io::sam_tag_dictionary. + *\brief Gets the value of known SAM tags by its correct type instead of the std::variant. + * \tparam tag The unique tag id of a SAM tag. + * \returns The value corresponding to the key `tag` of type bio::map_io::sam_tag_type::type. + * + * \details + * + * See the bio::map_io::sam_tag_dictionary detailed documentation below for an example. + * + * \attention This function is only available for tags that have an + * bio::map_io::sam_tag_type::type overload. See the type trait + * documentation for further details. + * \{ + */ + + //!\brief Uses std::map::operator[] for access and default initializes new keys. + template + //!\cond + requires(!std::same_as, variant_type>) + //!\endcond + auto & get() & + { + if ((*this).count(tag) == 0) + (*this)[tag] = sam_tag_type_t{}; // set correct type if tag is not set yet on + + return std::get>((*this)[tag]); + } + + //!\brief Uses std::map::operator[] for access and default initializes new keys. + template + //!\cond + requires(!std::same_as, variant_type>) + //!\endcond + auto && get() && + { + if ((*this).count(tag) == 0) + (*this)[tag] = sam_tag_type_t{}; // set correct type if tag is not set yet on + + return std::get>(std::move((*this)[tag])); + } + + //!\brief Uses std::map::at() for access and throws when the key is unknown. + //!\throws std::out_of_range if map has no key `tag`. + template + //!\cond + requires(!std::same_as, variant_type>) + //!\endcond + auto const & get() const & { return std::get>((*this).at(tag)); } + + //!\brief Uses std::map::at() for access and throws when the key is unknown. + //!\throws std::out_of_range if map has no key `tag`. + template + //!\cond + requires(!std::same_as, variant_type>) + //!\endcond + auto const && get() const && { return std::get>(std::move((*this).at(tag))); } + //!\} +}; + +} // namespace map_io +} // namespace bio diff --git a/include/bio/record.hpp b/include/bio/record.hpp index 464563c..3e81f4a 100644 --- a/include/bio/record.hpp +++ b/include/bio/record.hpp @@ -54,24 +54,24 @@ enum class field : uint64_t // Fields unique to alignment io qname = id, flag = _private + 1, //!< The alignment flag (bit information), `uint16_t` value. - /*ref_id*/ //!< The identifier of the (reference) sequence that SEQ was aligned to. + rname = ref_id, //!< The identifier of the (reference) sequence that SEQ was aligned to. /*pos*/ - mapq, //!< The mapping quality of the SEQ alignment, usually a ohred-scaled score. - cigar, //!< The cigar vector (std::vector) representing the alignment in SAM/BAM format. - next_ref_id, - next_pos, - tlen, + mapq = flag + 1, //!< The mapping quality of the SEQ alignment, usually a ohred-scaled score. + cigar, //!< The cigar vector (std::vector) representing the alignment in SAM/BAM format. + rnext, //!< The identifier of the (reference) sequence that the mate mapped to. + pnext, //!< The position where the mate mapped onto `rnext`. + tlen, //!< The observed template length. /*seq*/ /*qual*/ - optionals, //!< The optional fields in the SAM format, stored in a dictionary. + tags, //!< The optional fields in the SAM format, stored in a dictionary. /*_private*/ // Fields unique to variant io chrom = ref_id, //!< CHROM field in Var I/O. /*pos*/ /*id*/ - ref = optionals + 1, //!< REF field in Var I/O. - alt, //!< ALT field in Var I/O. + ref = tags + 1, //!< REF field in Var I/O. + alt, //!< ALT field in Var I/O. /*qual*/ filter, //!< FILTER field in Var I/O. info, //!< INFO field in Var I/O. @@ -239,17 +239,17 @@ struct record : seqan3::detail::transfer_template_args_onto_t +#include + +#include + +#include + +#include +#include + +#include "sam_input_test_template.hpp" + +// ---------------------------------------------------------------------------- +// fixture +// ---------------------------------------------------------------------------- + +template <> +struct sam_file_read : public sam_file_data +{ + // ----------------------------------------------------------------------------------------------------------------- + // formatted input + // ----------------------------------------------------------------------------------------------------------------- + + using stream_type = std::istringstream; + + std::string big_header_input{ + R"(@HD VN:1.6 SO:coordinate SS:coordinate:queryname GO:none +@PG ID:qc PN:quality_control CL:qc -f file1 DS:trim reads with low qual VN:1.0.0 +@PG ID:novoalign PN:novoalign VN:V3.02.07 CL:novoalign -d /path/hs37d5.ndx -f /path/file.fastq.gz PP:qc +@SQ SN:ref LN:249250621 +@SQ SN:ref2 LN:243199373 AS:hs37d5 +@RG ID:U0a_A2_L1 PL:illumina PU:1 LB:1 SM:NA12878 +@RG ID:U0a_A2_L2 PL:illumina SM:NA12878 PU:1 LB:1 +@CO Tralalalalalala this is a comment +)"}; + + std::string simple_three_reads_input{ + R"(@HD VN:1.6 +@SQ SN:ref LN:34 +read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 NM:i:7 +read2 42 ref 2 62 1H7M1D1M1S2H ref 10 300 AGGCTGNAG !##$&'()* xy:B:S,3,4,5 +read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ +)"}; + + std::string verbose_reads_input{ + // "@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n" // todo: check why adding this segfaults, adding this removes warnings + "read1\t41\tref\t1\t61\t1S1M1D1M1I\t=\t10\t300\tACGT\t!##$\taa:A:c" + "\tNM:i:-7" + "\tAS:i:2" + "\tff:f:3.1" + "\tzz:Z:str" + "\tCC:i:300" + "\tcc:i:-300\n" + "read2\t42\tref\t2\t62\t1H7M1D1M1S2H\tref\t10\t300\tAGGCTGNAG\t!##$&'()*\tbc:B:c,-3" + "\tbC:B:C,3,200" + "\tbs:B:s,-3,200,-300" + "\tbS:B:S,300,40,500" + "\tbi:B:i,-3,200,-66000" + "\tbI:B:I,294967296" + "\tbf:B:f,3.5,0.1,43.8" + "\tbH:H:1AE301\n" + "read3\t43\tref\t3\t63\t1S1M1P1M1I1M1I1D1M1S\tref\t10\t300\tGGAGTATA\t!!*+,-./\n"}; + + std::string empty_input{"*\t0\t*\t0\t0\t*\t*\t0\t0\t*\t*\n"}; + + std::string empty_cigar{"read1\t41\tref\t1\t61\t*\tref\t10\t300\tACGT\t!##$\n"}; + + std::string unknown_ref{ + "read1\t41\traf\t1\t61\t1S1M1D1M1I\t=\t10\t300\tACGT\t!##$\taa:A:c\tAS:i:2\tff:f:3.1\tzz:Z:str\n"}; + + std::string unknown_ref_header{"@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n*\t0\tunknown_ref\t1\t0\t4M\t*\t0\t0\tAAAA\t*\n"}; + + // ----------------------------------------------------------------------------------------------------------------- + // formatted output + // ----------------------------------------------------------------------------------------------------------------- + + std::string simple_three_reads_output{// compared to simple_three_reads_input this has no hard clipping + R"(@HD VN:1.6 +@SQ SN:ref LN:34 +read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 NM:i:7 +read2 42 ref 2 62 7M1D1M1S ref 10 300 AGGCTGNAG !##$&'()* xy:B:S,3,4,5 +read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ +)"}; + + std::string verbose_output{ + R"(@HD VN:1.6 SO:unknown GO:none +@SQ SN:ref LN:34 AN:other_name +@RG ID:group1 DS:more info +@PG ID:prog1 PN:cool_program CL:./prog1 PP:a DS:b VN:c +@CO This is a comment. +read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 CC:i:300 NM:i:-7 aa:A:c cc:i:-300 ff:f:3.1 zz:Z:str +read2 42 ref 2 62 7M1D1M1S ref 10 300 AGGCTGNAG !##$&'()* bC:B:C,3,200 bI:B:I,294967296 bS:B:S,300,40,500 bc:B:c,-3 bf:B:f,3.5,0.1,43.8 bi:B:i,-3,200,-66000 bs:B:s,-3,200,-300 +read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ +)"}; + + std::string special_output{ + R"(@HD VN:1.6 +@SQ SN:ref LN:34 +read1 41 * 1 61 1S1M1D1M1I * 0 0 ACGT !##$ +)"}; + + std::string wrong_hexadecimal_tag{ + R"(read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ bH:H:1AE30 +)"}; +}; + +// --------------------------------------------------------------------------------------------------------------------- +// parametrized tests +// --------------------------------------------------------------------------------------------------------------------- + +INSTANTIATE_TYPED_TEST_SUITE_P(sam, sam_file_read, seqan3::format_sam, ); +// INSTANTIATE_TYPED_TEST_SUITE_P(sam, sam_file_write, seqan3::format_sam, ); + +// --------------------------------------------------------------------------------------------------------------------- +// SAM specifics +// --------------------------------------------------------------------------------------------------------------------- + +struct sam_input_test : public sam_file_data +{}; + +TEST_F(sam_input_test, no_hd_line_in_header) +{ + // the header line (@HD) is optional + std::istringstream istream{std::string{"@SQ\tSN:ref\tLN:34\nread1\t41\tref\t1\t61\t*\tref\t10\t300\tACGT\t!##$\n"}}; + + using record_t = bio::record)>, + std::remove_cvref_t)>>; + bio::format_input_handler input_handler{istream, default_options /*defined in sam_input_test_template*/}; + record_t rec; + + EXPECT_NO_THROW(input_handler.parse_next_record_into(rec)); + + EXPECT_EQ(rec.id(), std::string{"read1"}); +} + +TEST_F(sam_input_test, windows_file) +{ + std::istringstream istream(std::string{"read1\t41\tref\t1\t61\t*\tref\t10\t300\tACGT\t!##$\r\n"}); + + using record_t = bio::record)>, + std::remove_cvref_t)>>; + bio::format_input_handler input_handler{istream, default_options /*defined in sam_input_test_template*/}; + record_t rec; + + EXPECT_NO_THROW(input_handler.parse_next_record_into(rec)); + + EXPECT_EQ(rec.id(), std::string{"read1"}); +} + +TEST_F(sam_input_test, seqan3_issue2195) +{ // see issue https://github.com/seqan/seqan3/issues/2195 + using seqan3::operator""_phred42; + + auto ftype = bio::ttag)>; + using record_t = bio::record)>, + std::remove_cvref_t>; + record_t rec; + + { + std::istringstream istream{"*r1\t4\t1\t10\t0\t5M\t=\t136097\t-121\tACTGA\t*9<9;\tNM:i:1\tMQ:i:0\n"}; + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_NO_THROW(input_handler.parse_next_record_into(rec)); + + EXPECT_RANGE_EQ(rec.id(), std::string{"*r1"}); + EXPECT_RANGE_EQ(rec.qual(), "*9<9;"_phred42); + } + + { + std::istringstream istream{"*\t4\t1\t10\t0\t2M\t=\t136097\t-121\tAC\t*1\tNM:i:1\tMQ:i:0\n"}; + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_NO_THROW(input_handler.parse_next_record_into(rec)); + + EXPECT_RANGE_EQ(rec.id(), std::string{""}); + EXPECT_RANGE_EQ(rec.qual(), "*1"_phred42); + } +} + +struct sam_input_format_error : public sam_file_data +{}; + +TEST_F(sam_input_format_error, illegal_character_in_seq) +{ + std::istringstream istream(std::string("*\t0\t*\t0\t0\t*\t*\t0\t0\tAC!T\t*\n")); + + using record_t = bio::record)>, + std::remove_cvref_t)>>; + bio::format_input_handler input_handler{istream, default_options /*defined in sam_input_test_template*/}; + record_t rec; + + EXPECT_THROW(input_handler.parse_next_record_into(rec), seqan3::invalid_char_assignment); +} + +TEST_F(sam_input_format_error, invalid_arithmetic_value) +{ + using record_t = bio::record)>, + std::remove_cvref_t)>>; + record_t rec; + + // invalid value + { + std::istringstream istream(std::string("*\t0\t*\t1abc\t0\t*\t*\t0\t0\t*\t*\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), std::runtime_error); + } + // overflow error + { + std::istringstream istream = std::istringstream(std::string("*\t0\t*\t2147483650\t0\t*\t*\t0\t0\t*\t*\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), std::runtime_error); + } + // negative value as pos - do we want this semantic check? + // { + // std::istringstream istream = std::istringstream(std::string("*\t0\t*\t-3\t0\t*\t*\t0\t0\t*\t*\n")); + // bio::format_input_handler input_handler{istream, default_options}; + // EXPECT_THROW(input_handler.parse_next_record_into(rec), std::runtime_error); + // } + + // negative value as mate mapping position - do we want this semantic check? + // { + // std::istringstream istream = std::istringstream(std::string("*\t0\t*\t0\t0\t*\t*\t-3\t0\t*\t*\n")); + // bio::format_input_handler input_handler{istream, default_options}; + // EXPECT_THROW(input_handler.parse_next_record_into(rec), std::runtime_error); + // } +} + +TEST_F(sam_input_format_error, invalid_cigar) +{ + using record_t = bio::record)>, + std::remove_cvref_t>)>>; + record_t rec; + + // unkown operation + { + std::istringstream istream(std::string("*\t0\t*\t0\t0\t5Z\t*\t0\t0\t*\t*\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), seqan3::invalid_char_assignment); + } + // negative number as operation count + { + std::istringstream istream = std::istringstream(std::string("*\t0\t*\t0\t0\t-5M\t*\t0\t0\t*\t*\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), bio::format_error); + } + // negative number as operation count - long cigar + { + std::istringstream istream = std::istringstream(std::string("*\t0\t*\t0\t0\t3S4M1I-5M2D2M\t*\t0\t0\t*\t*\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), bio::format_error); + } +} + +TEST_F(sam_input_format_error, invalid_sam_tag_format) +{ + using record_t = bio::record)>, + std::remove_cvref_t)>>; + record_t rec; + + // type identifier is wrong + { + std::istringstream istream(std::string("*\t0\t*\t0\t0\t*\t*\t0\t0\t*\t*\tNM:X:3\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), bio::format_error); + } + // Array subtype identifier is wrong + { + std::istringstream istream = std::istringstream(std::string("*\t0\t*\t0\t0\t*\t*\t0\t0\t*\t*\tNM:B:x3,4\n")); + bio::format_input_handler input_handler{istream, default_options}; + EXPECT_THROW(input_handler.parse_next_record_into(rec), bio::format_error); + } +} diff --git a/test/unit/format/sam_input_test_template.hpp b/test/unit/format/sam_input_test_template.hpp new file mode 100644 index 0000000..d82ec47 --- /dev/null +++ b/test/unit/format/sam_input_test_template.hpp @@ -0,0 +1,286 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md +// ----------------------------------------------------------------------------------------------------- + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using seqan3::operator""_cigar_operation; +using seqan3::operator""_dna5; +using seqan3::operator""_phred42; +using seqan3::operator""_tag; + +// global variables for reuse +bio::map_io::reader_options default_options{}; +constexpr std::string_view warning_prefix{"[B.I.O sam format warning in line "}; + +struct sam_file_data : public ::testing::Test +{ + sam_file_data() + { + ref_sequences = std::vector{ref_seq}; + ref_ids = std::vector{ref_id}; + header = seqan3::sam_file_header{ref_ids}; + header.ref_id_info.emplace_back(ref_seq.size(), ""); + header.ref_dict[header.ref_ids()[0]] = 0; // set up header which is otherwise done on file level + } + + std::vector seqs{"ACGT"_dna5, "AGGCTGNAG"_dna5, "GGAGTATA"_dna5}; + + std::vector ids{"read1", "read2", "read3"}; + + std::vector> quals{ + {"!##$"_phred42}, + {"!##$&'()*"_phred42}, + {"!!*+,-./"_phred42}, + }; + + std::vector offsets{1, 0, 1}; + + seqan3::dna5_vector ref_seq = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5; + + std::string ref_id = "ref"; + + std::vector positions // 1-based in b.i.o. 0-based in seqan3 + {1, 2, 3}; + + // clang-format off + std::vector> cigars + { + // cigar read1 + {{1, 'S'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}}, + // cigar read2 + {{1, 'H'_cigar_operation}, + {7, 'M'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}, + {2, 'H'_cigar_operation}}, + // cigar read3 + {{1, 'S'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'P'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'I'_cigar_operation}, + {1, 'D'_cigar_operation}, + {1, 'M'_cigar_operation}, + {1, 'S'_cigar_operation}} + }; + // clang-format on + + std::vector flags{bio::map_io::sam_flag{41u}, + bio::map_io::sam_flag{42u}, + bio::map_io::sam_flag{43u}}; + + std::vector mapqs{61u, 62u, 63u}; + + std::vector, std::optional, int32_t>> mates // position 1-based + { + {0, 10, 300}, + {0, 10, 300}, + {0, 10, 300} + }; + + std::vector tag_dicts{bio::map_io::sam_tag_dictionary{}, + bio::map_io::sam_tag_dictionary{}, + bio::map_io::sam_tag_dictionary{}}; + + std::vector ref_sequences{}; + std::vector ref_ids{}; + seqan3::sam_file_header> header{}; +}; + +template +struct sam_file_read : public sam_file_data +{}; + +TYPED_TEST_SUITE_P(sam_file_read); + +// ---------------------------------------------------------------------------- +// sam_file_read +// ---------------------------------------------------------------------------- + +TYPED_TEST_P(sam_file_read, full_data_set) +{ + // prepare tag dictionary + this->tag_dicts[0]["NM"_tag] = -7; + this->tag_dicts[0]["AS"_tag] = 2; + this->tag_dicts[0]["CC"_tag] = 300; + this->tag_dicts[0]["cc"_tag] = -300; + this->tag_dicts[0]["aa"_tag] = 'c'; + this->tag_dicts[0]["ff"_tag] = 3.1f; + this->tag_dicts[0]["zz"_tag] = "str"; + this->tag_dicts[1]["bc"_tag] = std::vector{-3}; + this->tag_dicts[1]["bC"_tag] = std::vector{3u, 200u}; + this->tag_dicts[1]["bs"_tag] = std::vector{-3, 200, -300}; + this->tag_dicts[1]["bS"_tag] = std::vector{300u, 40u, 500u}; + this->tag_dicts[1]["bi"_tag] = std::vector{-3, 200, -66000}; + this->tag_dicts[1]["bI"_tag] = std::vector{294967296u}; + this->tag_dicts[1]["bf"_tag] = std::vector{3.5f, 0.1f, 43.8f}; + this->tag_dicts[1]["bH"_tag] = std::vector{std::byte{0x1A}, std::byte{0xE3}, std::byte{0x01}}; + + // read data + typename TestFixture::stream_type istream{this->verbose_reads_input}; + using record_t = bio::record; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + for (unsigned i = 0; i < 3; ++i) + { + input_handler.parse_next_record_into(rec); + + EXPECT_EQ(rec.id(), this->ids[i]) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.flag(), this->flags[i]) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.rname(), this->ref_id) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.pos(), this->positions[i]) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.mapq(), this->mapqs[i]) << "failed at i = " << i << std::endl; + EXPECT_RANGE_EQ(rec.cigar(), this->cigars[i]); + EXPECT_EQ(rec.rnext(), this->ref_id) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.pnext(), std::get<1>(this->mates[i])) << "failed at i = " << i << std::endl; + EXPECT_EQ(rec.tlen(), std::get<2>(this->mates[i])) << "failed at i = " << i << std::endl; + EXPECT_RANGE_EQ(rec.seq(), this->seqs[i]); + EXPECT_RANGE_EQ(rec.qual(), this->quals[i]); + EXPECT_EQ(rec.tags(), this->tag_dicts[i]) << "failed at i = " << i << std::endl; + } +} + +TYPED_TEST_P(sam_file_read, all_missing_data) +{ + typename TestFixture::stream_type istream{this->empty_input}; + + using record_t = bio::record; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + input_handler.parse_next_record_into(rec); + + EXPECT_TRUE(rec.id().empty()); + EXPECT_TRUE(rec.rname().empty()); + EXPECT_TRUE(rec.rnext().empty()); + EXPECT_TRUE(rec.cigar().empty()); + EXPECT_TRUE(rec.seq().empty()); + EXPECT_TRUE(rec.qual().empty()); + EXPECT_TRUE(rec.tags().empty()); + + EXPECT_EQ(rec.flag(), bio::map_io::sam_flag{0u}); + EXPECT_EQ(rec.pos(), 0); + EXPECT_EQ(rec.pnext(), 0); + EXPECT_EQ(rec.mapq(), 0u); + EXPECT_EQ(rec.tlen(), 0); +} + +TYPED_TEST_P(sam_file_read, select_fields) +{ + typename TestFixture::stream_type istream{this->verbose_reads_input}; + + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; + + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + for (unsigned i = 0; i < 3; ++i) + { + input_handler.parse_next_record_into(rec); + EXPECT_EQ(rec.rname(), this->ref_id); + EXPECT_EQ(rec.pos(), this->positions[i]); + } +} + +TYPED_TEST_P(sam_file_read, warning_rname_not_in_header) +{ + typename TestFixture::stream_type istream{this->verbose_reads_input}; + + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; + + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + std::string warning_msg{warning_prefix.data() + std::string{"1] The reference sequence name \""} + this->ref_id + + "\" is not present in the header.\n"}; + + for (unsigned i = 0; i < 3; ++i) + { + testing::internal::CaptureStderr(); + input_handler.parse_next_record_into(rec); + EXPECT_EQ(testing::internal::GetCapturedStderr(), ((i == 0) ? warning_msg : std::string{})); + + EXPECT_EQ(rec.rname(), this->ref_id); + EXPECT_EQ(rec.pos(), this->positions[i]); + } +} + +TYPED_TEST_P(sam_file_read, warning_rnext_not_in_header) +{ + typename TestFixture::stream_type istream{this->verbose_reads_input}; + + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; + + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + std::string warning_msg{warning_prefix.data() + std::string{"1] The mates reference sequence name \""} + + this->ref_id + "\" is not present in the header.\n"}; + + for (unsigned i = 0; i < 3; ++i) + { + testing::internal::CaptureStderr(); + input_handler.parse_next_record_into(rec); + EXPECT_EQ(testing::internal::GetCapturedStderr(), ((i == 0) ? warning_msg : std::string{})); + + EXPECT_EQ(rec.rnext(), this->ref_id); + EXPECT_EQ(rec.pnext(), std::get<1>(this->mates[i])); + } +} + +TYPED_TEST_P(sam_file_read, format_error_uneven_hexadecimal_tag) +{ + typename TestFixture::stream_type istream{this->wrong_hexadecimal_tag}; + + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; + + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; + + EXPECT_THROW(input_handler.parse_next_record_into(rec), bio::format_error); +} + +REGISTER_TYPED_TEST_SUITE_P(sam_file_read, + full_data_set, + all_missing_data, + select_fields, + warning_rname_not_in_header, + warning_rnext_not_in_header, + format_error_uneven_hexadecimal_tag); diff --git a/test/unit/map_io/CMakeLists.txt b/test/unit/map_io/CMakeLists.txt new file mode 100644 index 0000000..19437d1 --- /dev/null +++ b/test/unit/map_io/CMakeLists.txt @@ -0,0 +1,2 @@ +bio_test(header_test.cpp) +bio_test(map_io_reader_test.cpp) diff --git a/test/unit/map_io/data.hpp b/test/unit/map_io/data.hpp new file mode 100644 index 0000000..979b3f5 --- /dev/null +++ b/test/unit/map_io/data.hpp @@ -0,0 +1,34 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#include + +inline constexpr std::string_view input = + R"(@HD VN:1.6 +@SQ SN:ref LN:34 +read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 NM:i:7 +read2 42 ref 2 62 1H7M1D1M1S2H ref 10 300 AGGCTGNAG !##$&'()* xy:B:S,3,4,5 +read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ +)"; + +inline constexpr std::string_view input_bgzipped{ + "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" + "\xbc\x00\x55\x8e\xcb\x0a\xc2\x30\x14\x44\xd7\xd3\xbf\x28\x15\x1f" + "\xb5\xd6\xde\x24\xa6\x90\x55\x63\x0b\xa9\x60\x83\x92\xe0\x5e\xb0" + "\x82\xdb\xae\xf4\xef\x6d\xd3\x8d\x2e\x86\x0b\xc3\x3d\x87\xa9\xda" + "\x06\x37\xab\x28\x97\x51\xe5\xae\x70\x56\x0d\xfd\x13\x67\xab\xb8" + "\x88\x86\xfe\xfe\x20\x08\xc2\x54\x11\x24\x81\x1c\x75\xd4\x8c\x39" + "\xcd\x5d\x01\x5e\x14\xd0\xb5\xf1\x88\x93\x64\x01\xed\xd4\x4b\x31" + "\xd8\x6e\x3c\x65\xe0\x19\x04\x0b\xbf\x0c\x92\x81\xda\x72\xe6\x1d" + "\x6b\xff\x0c\xc6\xd4\xde\x58\x6d\x82\x66\xb9\x5a\x6f\x52\xbc\x3f" + "\xea\xa8\x5c\xc6\x33\x91\x1d\x82\x8a\x43\xf0\x00\x71\x48\x3e\x4f" + "\xb9\x4c\x53\x42\x82\xf4\x57\x69\x8c\x36\x5e\x7b\x8d\x38\x4e\xb7" + "\xd9\x2e\xdf\x47\x5f\x01\x82\x81\x27\xeb\x00\x00\x00\x1f\x8b\x08" + "\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00\x1b\x00\x03" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 217}; diff --git a/test/unit/map_io/header_test.cpp b/test/unit/map_io/header_test.cpp new file mode 100644 index 0000000..c1b43fb --- /dev/null +++ b/test/unit/map_io/header_test.cpp @@ -0,0 +1,337 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#include + +#include + +TEST(map_io_header, ctors) +{ + // todo +} + +TEST(map_io_header, read_full_header) +{ + std::string big_header_input{ + R"(@HD VN:1.6 SO:coordinate SS:coordinate:queryname GO:none +@PG ID:qc PN:quality_control CL:qc -f file1 DS:trim reads with low qual VN:1.0.0 +@PG ID:novoalign PN:novoalign VN:V3.02.07 CL:novoalign -d /path/hs37d5.ndx -f /path/file.fastq.gz PP:qc +@SQ SN:ref LN:249250621 +@SQ SN:ref2 LN:243199373 AS:hs37d5 +@RG ID:U0a_A2_L1 PL:illumina PU:1 LB:1 SM:NA12878 +@RG ID:U0a_A2_L2 PL:illumina SM:NA12878 PU:1 LB:1 +@CO Tralalalalalala this is a comment +)"}; + + bio::map_io::header header{}; + EXPECT_NO_THROW(header.read(big_header_input)); + + EXPECT_EQ(header.format_version, "1.6"); + EXPECT_EQ(header.sorting, "coordinate"); + EXPECT_EQ(header.subsorting, "coordinate:queryname"); + EXPECT_EQ(header.grouping, "none"); + + EXPECT_EQ(header.program_infos[0].id, "qc"); + EXPECT_EQ(header.program_infos[0].name, "quality_control"); + EXPECT_EQ(header.program_infos[0].version, "1.0.0"); + EXPECT_EQ(header.program_infos[0].description, "trim reads with low qual"); + EXPECT_EQ(header.program_infos[0].previous, ""); + EXPECT_EQ(header.program_infos[0].command_line_call, "qc -f file1"); + EXPECT_EQ(header.program_infos[1].id, "novoalign"); + EXPECT_EQ(header.program_infos[1].name, "novoalign"); + EXPECT_EQ(header.program_infos[1].version, "V3.02.07"); + EXPECT_EQ(header.program_infos[1].description, ""); + EXPECT_EQ(header.program_infos[1].previous, "qc"); + EXPECT_EQ(header.program_infos[1].command_line_call, "novoalign -d /path/hs37d5.ndx -f /path/file.fastq.gz"); + + std::string id1{"ref"}; + std::string id2{"ref2"}; + + EXPECT_EQ(header.rnames_info().size(), 2u); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id1))], + (std::tuple{249250621u, ""})); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id2))], + (std::tuple{243199373u, "AS:hs37d5"})); + + EXPECT_EQ(header.read_groups[0], + (std::pair{"U0a_A2_L1", "PL:illumina\tPU:1\tLB:1\tSM:NA12878"})); + EXPECT_EQ(header.read_groups[1], + (std::pair{"U0a_A2_L2", "PL:illumina\tSM:NA12878\tPU:1\tLB:1"})); + + EXPECT_EQ(header.comments[0], "Tralalalalalala this is a comment"); +} + +TEST(map_io_header, independent_order) +{ + // order of tags does not matter + std::string header_str{ + "@HD\tGO:none\tSO:coordinate\tVN:1.6\tSS:coordinate:queryname\n" + "@CO\tTralalalalalala this is a comment\n" + "@PG\tPN:novoalign\tPP:qc\tID:novoalign\tVN:V3.02.07\tCL:novoalign -d /hs37d5.ndx -f /file.fastq.gz\n" + "@SQ\tAS:hs37d5\tSN:ref2\tLN:243199373\n" + "@RG\tLB:1\tSM:NA12878\tPL:illumina\tPU:1\tID:U0a_A2_L1\n"}; + + bio::map_io::header header{}; + EXPECT_NO_THROW(header.read(header_str)); + + EXPECT_EQ(header.format_version, "1.6"); + EXPECT_EQ(header.sorting, "coordinate"); + EXPECT_EQ(header.subsorting, "coordinate:queryname"); + EXPECT_EQ(header.grouping, "none"); + + EXPECT_EQ(header.program_infos[0].id, "novoalign"); + EXPECT_EQ(header.program_infos[0].name, "novoalign"); + EXPECT_EQ(header.program_infos[0].version, "V3.02.07"); + EXPECT_EQ(header.program_infos[0].description, ""); + EXPECT_EQ(header.program_infos[0].previous, "qc"); + EXPECT_EQ(header.program_infos[0].command_line_call, "novoalign -d /hs37d5.ndx -f /file.fastq.gz"); + + std::string id2{"ref2"}; + EXPECT_EQ(header.rnames_info().size(), 1u); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id2))], + (std::tuple{243199373u, "AS:hs37d5"})); + + EXPECT_EQ(header.read_groups[0], + (std::pair{"U0a_A2_L1", "LB:1\tSM:NA12878\tPL:illumina\tPU:1"})); + + EXPECT_EQ(header.comments[0], "Tralalalalalala this is a comment"); +} + +TEST(map_io_header, issue2423) +{ + // former seqan3 issue https://github.com/seqan/seqan3/pull/2423 + std::string many_refs_input{[]() + { + std::string result{"@HD\tVN:1.6\n"}; + for (size_t i = 0; i < 64; ++i) + result += "@SQ\tSN:ref_" + std::to_string(i) + "\tLN:100\n"; + return result; + }()}; + + bio::map_io::header header{}; + EXPECT_NO_THROW(header.read(many_refs_input)); + + EXPECT_EQ(header.rnames().size(), 64u); + EXPECT_EQ(header.rnames_info().size(), 64u); + EXPECT_EQ(header.rname_to_pos().size(), 64u); +} + +TEST(map_io_header, tag_LN_maximum_value) +{ + // maximum LN value is 2^31-1 + std::string header_str{"@SQ\tSN:ref\tLN:2147483647\n"}; + + bio::map_io::header header{}; + EXPECT_NO_THROW(header.read(header_str)); + + EXPECT_EQ(header.rnames().size(), 1u); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at("ref"))], + (std::tuple{2147483647u, ""})); +} + +TEST(map_io_header_warnings, user_defined_tags) +{ + // user defined tags should not trigger errors, but print warnings to cerr + std::string header_str{ + "@HD\tVN:1.6\tVB:user_tag\tSB:user_tag\tGB:user_tag\tpb:user_tag\n" + "@SQ\tSN:ref2\tLN:243199373\tSB:user_tag\tLB:user_tag\tpb:user_tag\n" + "@RG\tID:U0a_A2_L1\tIB:user_tag\tpb:user_tag\n" + "@PG\tID:qc\tIB:user_tag\tPB:user_tag\tCB:user_tag\tDB:user_tag\tVB:user_tag\tpb:user_tag\n"}; + + std::string expected_cerr{ + "Unsupported SAM header tag in @HD: VB\n" + "Unsupported SAM header tag in @HD: SB\n" + "Unsupported SAM header tag in @HD: GB\n" + "Unsupported SAM header tag in @HD: pb\n" + "Unsupported SAM header tag in @PG: IB\n" + "Unsupported SAM header tag in @PG: PB\n" + "Unsupported SAM header tag in @PG: CB\n" + "Unsupported SAM header tag in @PG: DB\n" + "Unsupported SAM header tag in @PG: VB\n" + "Unsupported SAM header tag in @PG: pb\n"}; + + bio::map_io::header header{}; + testing::internal::CaptureStderr(); + EXPECT_NO_THROW(header.read(header_str)); + EXPECT_EQ(testing::internal::GetCapturedStderr(), expected_cerr); +} + +TEST(map_io_header_errors, invalid_tag_HA) +{ + // invalid header record type: @HA + std::string header_str{"@HA\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, invalid_tag_SA) +{ + // invalid header record type: @SA + std::string header_str{"@SA\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, invalid_tag_PA) +{ + // invalid header record type: @PA + std::string header_str{"@PA\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, invalid_tag_RA) +{ + // invalid header record type: @RA + std::string header_str{"@RA\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, invalid_tag_CA) +{ + // invalid header record type: @CA + std::string header_str{"@CA\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, invalid_tag_TT) +{ + // invalid header record type: @TT + std::string header_str{"@TT\tthis is not a valid tag\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, missing_tag_VN) +{ + // missing VN tag in @HD + std::string header_str{"@HD\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, missing_tag_SN) +{ + // missing SN tag in @SQ + std::string header_str{"@SQ\tLN:1\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, missing_tag_LN) +{ + // missing LN tag in @SQ + std::string header_str{"@SQ\tSN:ref\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +// do we want this semantic check? +// TEST(map_io_header_errors, tag_LN_cannot_be_0) +// { +// // LN cannot be 0 +// std::string header_str +// { +// "@SQ\tSN:ref\tLN:0\n" +// }; + +// bio::map_io::header header{}; +// EXPECT_THROW(header.read(header_str), bio::format_error); +// } + +// do we want this semantic check? +// TEST(map_io_header_errors, tag_LN_cannot_be_negative) +// { +// // LN cannot be negative +// std::string header_str +// { +// "@SQ\tSN:ref\tLN:-1\n" +// }; + +// bio::map_io::header header{}; +// EXPECT_THROW(header.read(header_str), bio::format_error); +// } + +TEST(map_io_header_errors, tag_LN_overflow) +{ + // LN exceeds maximum value + std::string header_str{"@SQ\tSN:ref\tLN:2147483648\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, missing_tag_RG_ID) +{ + // missing ID tag in @RG + std::string header_str{"@RG\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +TEST(map_io_header_errors, missing_tag_PG_ID) +{ + // missing ID tag in @PG + std::string header_str{"@PG\n"}; + + bio::map_io::header header{}; + EXPECT_THROW(header.read(header_str), bio::format_error); +} + +// write + +// TEST_F(sam_input_test, write_different_header) +// { +// std::ostringstream ostream; + +// auto write_header = [&] () +// { +// seqan3::sam_file_output fout{ostream, seqan3::format_sam{}, seqan3::fields{}}; +// ASSERT_NO_THROW(fout.emplace_back(&header, this->ref_id, 0)); +// }; + +// header.sorting = "unsorted"; +// header.grouping = "query"; + +// write_header(); +// ostream.flush(); +// EXPECT_EQ(ostream.str(), +// "@HD\tVN:1.6\tSO:unsorted\tGO:query\n@SQ\tSN:ref\tLN:34\n*\t0\tref\t1\t0\t*\t*\t0\t0\t*\t*\n"); + +// ostream = std::ostringstream{}; +// header.sorting = "queryname"; +// header.grouping = "reference"; + +// write_header(); +// ostream.flush(); +// EXPECT_EQ(ostream.str(), +// "@HD\tVN:1.6\tSO:queryname\tGO:reference\n@SQ\tSN:ref\tLN:34\n*\t0\tref\t1\t0\t*\t*\t0\t0\t*\t*\n"); + +// ostream = std::ostringstream{}; +// header.sorting = "coordinate"; +// header.subsorting = "query"; + +// write_header(); +// ostream.flush(); +// EXPECT_EQ(ostream.str(), +// "@HD\tVN:1.6\tSO:coordinate\tSS:query\tGO:reference\n@SQ\tSN:ref\tLN:34\n*\t0\tref\t1\t0\t*\t*\t0\t0\t*\t*\n"); +// } diff --git a/test/unit/map_io/map_io_reader_test.cpp b/test/unit/map_io/map_io_reader_test.cpp new file mode 100644 index 0000000..cdc6e51 --- /dev/null +++ b/test/unit/map_io/map_io_reader_test.cpp @@ -0,0 +1,286 @@ +// ----------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik +// Copyright (c) 2020-2021, deCODE Genetics +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/b.i.o./blob/master/LICENSE +// ----------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include +#include +#include + +#include + +#include "data.hpp" + +TEST(map_io_reader, concepts) +{ + using t = bio::map_io::reader<>; + EXPECT_TRUE((std::ranges::input_range)); + + using ct = bio::map_io::reader<> const; + // not const-iterable + EXPECT_FALSE((std::ranges::input_range)); +} + +void map_io_reader_filename_constructor(bool ext_check, auto &&... args) +{ + /* just the filename */ + { + seqan3::test::tmp_filename filename{"map_io_reader_constructor.sam"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + EXPECT_NO_THROW((bio::map_io::reader{filename.get_path(), std::forward(args)...})); + } + + // correct format check is done by tests of that format + + /* non-existent file */ + { + EXPECT_THROW((bio::map_io::reader{"/dev/nonexistant/foobarOOO", std::forward(args)...}), + bio::file_open_error); + } + + /* wrong extension */ + if (ext_check) + { + seqan3::test::tmp_filename filename{"map_io_reader_constructor.xyz"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + EXPECT_THROW((bio::map_io::reader{filename.get_path(), std::forward(args)...}), + bio::unhandled_extension_error); + } +} + +TEST(map_io_reader, constructor1_just_filename) +{ + map_io_reader_filename_constructor(true); + EXPECT_TRUE((std::same_as>)); +} + +TEST(map_io_reader, constructor1_with_opts) +{ + bio::map_io::reader_options opt{.field_types = bio::map_io::field_types_sam<>}; + using control_t = bio::map_io::reader, + std::remove_cvref_t)>, + seqan3::type_list>; + + map_io_reader_filename_constructor(true, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(map_io_reader, constructor2_just_filename_direct_format) +{ + map_io_reader_filename_constructor(false, bio::sam{}); + EXPECT_TRUE((std::same_as>)); +} + +TEST(map_io_reader, constructor2_with_opts_direct_format) +{ + bio::map_io::reader_options opt{.field_types = bio::map_io::field_types_sam<>}; + using control_t = bio::map_io::reader, + std::remove_cvref_t)>, + seqan3::type_list>; + + map_io_reader_filename_constructor(false, bio::sam{}, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(map_io_reader, constructor2_just_filename_format_variant) +{ + std::variant var{}; + + map_io_reader_filename_constructor(false, var); + EXPECT_TRUE((std::same_as>)); +} + +TEST(map_io_reader, constructor2_with_opts_format_variant) +{ + std::variant var{}; + bio::map_io::reader_options opt{.field_types = bio::map_io::field_types_sam<>}; + using control_t = bio::map_io::reader, + std::remove_cvref_t)>, + seqan3::type_list>; + + map_io_reader_filename_constructor(false, var, std::move(opt)); + EXPECT_TRUE((std::same_as)); +} + +TEST(map_io_reader, constructor3) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::map_io::reader{str, bio::sam{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(map_io_reader, constructor3_with_opts) +{ + std::istringstream str; + bio::map_io::reader_options opt{.field_types = bio::map_io::field_types_sam<>}; + using control_t = bio::map_io::reader, + std::remove_cvref_t)>, + seqan3::type_list>; + + EXPECT_NO_THROW((bio::map_io::reader{str, bio::sam{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(map_io_reader, constructor4) +{ + std::istringstream str; + + EXPECT_NO_THROW((bio::map_io::reader{std::move(str), bio::sam{}})); + EXPECT_TRUE((std::same_as>)); +} + +TEST(map_io_reader, constructor4_with_opts) +{ + std::istringstream str; + bio::map_io::reader_options opt{.field_types = bio::map_io::field_types_sam<>}; + using control_t = bio::map_io::reader, + std::remove_cvref_t)>, + seqan3::type_list>; + + EXPECT_NO_THROW((bio::map_io::reader{std::move(str), bio::sam{}, opt})); + EXPECT_TRUE((std::same_as)); +} + +TEST(map_io_reader, iteration) +{ + { + std::istringstream str{static_cast(input)}; + bio::map_io::reader reader{str, bio::sam{}}; + + EXPECT_EQ(std::ranges::distance(reader), 3); + } + + { + std::istringstream str{static_cast(input)}; + bio::map_io::reader reader{str, bio::sam{}}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("read")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 3); + } +} + +TEST(map_io_reader, empty_file) +{ + { + seqan3::test::tmp_filename filename{"map_io_reader_constructor.sam"}; + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + + bio::map_io::reader reader{filename.get_path()}; + + EXPECT_THROW(reader.begin(), bio::file_open_error); + } +} + +TEST(map_io_reader, empty_stream) +{ + { + std::istringstream str{""}; + bio::map_io::reader reader{str, bio::sam{}}; + + EXPECT_THROW(reader.begin(), bio::file_open_error); + } +} + +// TEST(map_io_reader, custom_field_types) +// { +// bio::map_io::reader_options opt{.field_types = bio::map_io::field_types}; + +// std::istringstream str{static_cast(input)}; +// bio::map_io::reader reader{str, bio::sam{}, opt}; + +// EXPECT_TRUE((std::same_as &>)); +// EXPECT_TRUE((std::same_as)); +// } + +TEST(map_io_reader, custom_field_ids_structured_bindings) +{ + bio::map_io::reader_options opt{.field_ids = bio::vtag, + .field_types = bio::ttag}; + + std::istringstream str{static_cast(input)}; + bio::map_io::reader reader{str, bio::sam{}, opt}; + + for (auto & [seq, id] : reader) + EXPECT_TRUE(id.starts_with("read")); +} + +TEST(map_io_reader, decompression_filename) +{ + seqan3::test::tmp_filename filename{"map_io_reader.sam.gz"}; + + { + std::ofstream filecreator{filename.get_path(), std::ios::out | std::ios::binary}; + bio::detail::fast_ostreambuf_iterator it{filecreator}; + it.write_range(input_bgzipped); + } + + bio::map_io::reader reader{filename.get_path()}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("read")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 3); +} + +TEST(map_io_reader, decompression_stream) +{ + std::istringstream str{static_cast(input_bgzipped)}; + + bio::map_io::reader reader{str, bio::sam{}}; + + size_t count = 0; + for (auto & rec : reader) + { + ++count; + EXPECT_TRUE(rec.id().starts_with("read")); + // only very basic check here, rest in format test + } + EXPECT_EQ(count, 3); +} + +TEST(map_io_reader, get_header) +{ + // get header before calling begin() + { + std::istringstream str{static_cast(input)}; + bio::map_io::reader reader{str, bio::sam{}}; + + bio::map_io::header const & hdr = reader.header(); + + EXPECT_EQ(hdr.format_version, "1.6"); + } + + // get header after calling begin() + { + std::istringstream str{static_cast(input)}; + bio::map_io::reader reader{str, bio::sam{}}; + + auto it = reader.begin(); + EXPECT_EQ(it->id(), "read1"); + + bio::map_io::header const & hdr = reader.header(); + + EXPECT_EQ(hdr.format_version, "1.6"); + } +}