diff --git a/include/bio/format/sam_input_handler.hpp b/include/bio/format/sam_input_handler.hpp index 43cfb32..4c2b1a5 100644 --- a/include/bio/format/sam_input_handler.hpp +++ b/include/bio/format/sam_input_handler.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -60,13 +61,13 @@ class format_input_handler : public format_input_handler_base> header; + map_io::header header; //!\brief Lowlevel stream iterator. lowlevel_iterator file_it; //!\brief Cache of the chromosome string. - std::string last_chrom; + std::string last_rname; //!\brief Cache of the chromosome index. - int32_t last_chrom_idx = -1; + int32_t last_rname_idx = -1; //!\brief Current line number in file. size_t line = 0; //!\brief Whether to print print_warnings or not. @@ -80,6 +81,16 @@ class format_input_handler : public format_input_handler_base : public format_input_handler_base(raw_record); if (raw_field != "*") - parse_field_aux(raw_field, parsed_field); + { + size_t rname_pos; + + if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end()) + { // rname name was not in header, insert! + header.push_back_rname(raw_field, raw_field.size(), ""); + + rname_pos = header.rnames().size() - 1; + + warning("The reference sequence name \"", raw_field, "\" is not present in the header."); + } + else + { + rname_pos = it->second; + } - // todo insert into header + if constexpr (std::integral>) + parsed_field = rname_pos; + else + parsed_field = header.rnames()[rname_pos]; + } } /* POS, MAPQ are handled correctly by default, unless we want pos to be read into an std:optional */ @@ -149,7 +178,7 @@ class format_input_handler : public format_input_handler_base : public format_input_handler_basesecond; + } + + if constexpr (std::integral>) + parsed_field = rname_pos; + else + parsed_field = header.rnames()[rname_pos]; } } @@ -195,10 +242,11 @@ class format_input_handler : public format_input_handler_base const & /**/, map_io::record_private_data & parsed_field) - // { - // parsed_field = map_io::record_private_data{.header_ptr = &header}; - // } + //!\brief Overload for parsing the private data. + void parse_field(vtag_t const & /**/, map_io::record_private_data & parsed_field) + { + parsed_field = map_io::record_private_data{.header_ptr = &header}; + } public: format_input_handler() = default; @@ -232,7 +280,7 @@ class format_input_handler : public format_input_handler_base>{}; + header = map_io::header{}; header.read(header_string); } diff --git a/include/bio/map_io/header.hpp b/include/bio/map_io/header.hpp index 964b134..d2d69dc 100644 --- a/include/bio/map_io/header.hpp +++ b/include/bio/map_io/header.hpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -41,7 +42,6 @@ namespace bio::map_io * * TODO */ -template > class header { public: @@ -56,63 +56,63 @@ class header header & operator=(header const &) = default; //!< Defaulted. header & operator=(header &&) = default; //!< Defaulted. - /*!\brief Construct from a range of reference ids which redirects the `ref_ids_ptr` member (non-owning). + /*!\brief Construct from a range of reference ids. * \param[in] The plain text header. * \param[in] ref_ids The range over reference ids to redirect the pointer at. */ + template // todo: restrict value type to be std::string_view constructible header(ref_ids_type & ref_ids) : - ref_ids_ptr{&ref_ids, ref_ids_deleter_noop}, - reference_names_present_in_header{true} - {} + reference_names_given_on_construction{true} + { + for (auto & id : ref_ids) + reference_names.emplace_back(id); + } - /*!\brief Construct from a rvalue range of reference ids which is moved into the `ref_ids_ptr` (owning). + /*!\brief Construct from a rvalue range of reference ids. Ids are copied over. * \param[in] ref_ids The range over reference ids to own. */ + template // todo: restrict value type to be std::string_view constructible header(ref_ids_type && ref_ids) : - ref_ids_ptr{new ref_ids_type{std::move(ref_ids)}, ref_ids_deleter_default}, - reference_names_present_in_header{true} - {} + reference_names_given_on_construction{true} + { + for (auto & id : ref_ids) + { + owned_reference_names.push_back(id); + reference_names.emplace_back(owned_reference_names.back()); + } + } //!\} //!\brief Defaulted three-way comparison. auto operator<=>(header const &) const = default; inline void read(std::string_view header_string); + private: - //!\brief The type of the internal ref_ids pointer. Allows dynamically setting ownership management. - using ref_ids_ptr_t = std::unique_ptr>; - //!\brief Stream deleter that does nothing (no ownership assumed). - static void ref_ids_deleter_noop(ref_ids_type *) {} - //!\brief Stream deleter with default behaviour (ownership assumed). - static void ref_ids_deleter_default(ref_ids_type * ptr) { delete ptr; } - //!\brief The key's type of ref_dict. - using key_type = std::conditional_t>, - std::span const>, - seqan3::type_reduce_t>>; - //!\brief The pointer to reference ids information (non-owning if reference information is given). - ref_ids_ptr_t ref_ids_ptr{new ref_ids_type{}, ref_ids_deleter_default}; - //!\brief Whether reference names are present in the current header. - bool reference_names_present_in_header{false}; - - //!\brief Custom hash function since std::hash is not defined for all range types (e.g. std::span). - struct key_hasher - { - //!\brief Hash a key. - template - size_t operator()(key_t && key) const noexcept - { - using char_t = std::ranges::range_value_t; - size_t result{0}; - std::hash h{}; - for (char_t character : key) - { - result *= 0x8F3F73B5CF1C9ADE; - result += h(character); - } - return result; - } - }; + //!\brief In case no reference information is given on construction, the reference names are stored here. + std::deque owned_reference_names; + + //!\brief The reference sequence names. + std::vector reference_names; + + std::vector> reference_names_info{}; + //!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range. + std::unordered_map reference_name_to_pos{}; + + //!\brief Whether reference sequence names were given to the header on construction. + bool reference_names_given_on_construction{false}; + + /* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const + { + // if (print_warnings) + // { + // seqan3::debug_stream << "[B.I.O sam format header warning in line " << line << "] "; // todo line numbers + seqan3::debug_stream << "[B.I.O sam format header warning] "; + (seqan3::debug_stream << ... << messages); + seqan3::debug_stream << std::endl; + // } + } public: /*!\name [@HD] File-level meta data * \brief You can directly edit these member variables. @@ -125,9 +125,10 @@ class header //!\} /*!\name [@SQ] Reference sequence dictionary - * \brief You can directly edit these member variables. + * \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers. * \{ */ + /*!\brief [@SQ SN] Reference sequence names * * \details @@ -146,9 +147,9 @@ class header * In this case, the reference information is parsed from the records field::ref_id and stored in the header. * This member function then provides access to the unique list of reference names encountered in the records. */ - ref_ids_type & ref_ids() + std::vector const & rnames() { - return *ref_ids_ptr; + return reference_names; } /*!\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information @@ -182,10 +183,25 @@ class header * * **UR:** URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:. If * it does not start with one of these protocols, it is assumed to be a file-system path */ - std::vector> ref_id_info{}; + std::vector> const & rnames_info() + { + return reference_names_info; + } + + //!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range. + std::unordered_map const & rname_to_pos() + { + return reference_name_to_pos; + } - //!\brief The mapping of reference id to position in the ref_ids() range and the ref_id_info range. - std::unordered_map ref_dict{}; + //!\brief Append a new rname entry with it's length and extra information and update reference_name_to_pos. + void push_back_rname(std::string_view const rname, int32_t const length, std::string_view const extra_info) + { + owned_reference_names.emplace_back(rname); + reference_names.emplace_back(owned_reference_names.back()); + reference_names_info.emplace_back(length, extra_info); + reference_name_to_pos[reference_names.back()] = reference_names.size() - 1; + } //!\} /*!\name [@RG] Read groups @@ -270,8 +286,7 @@ class header * not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual * error. */ -template -void header::read(std::string_view header_string) +void header::read(std::string_view header_string) { auto stream_view = header_string | seqan3::views::single_pass_input; @@ -379,7 +394,7 @@ void header::read(std::string_view header_string) case make_tag('S', 'Q'): // SQ (sequence dictionary) tag { - std::ranges::range_value_t id; + std::string id; std::tuple info{}; bool sequence_length_was_present{}; @@ -416,30 +431,30 @@ void header::read(std::string_view header_string) if (!sequence_length_was_present) throw format_error{"The required LN tag in @SQ is missing."}; - if (reference_names_present_in_header) - { // If reference information was given, the ids exist and we can fill ref_dict directly. - auto id_it = ref_dict.find(id); + if (!reference_names_given_on_construction) + { // Reference information was not given by the user but needs to be filled from the header + push_back_rname(id, std::get<0>(info), std::get<1>(info)); + } + else + { // If reference information was given we check them against the once in the header + auto id_it = reference_name_to_pos.find(id); - if (id_it == ref_dict.end()) - throw format_error{seqan3::detail::to_string("Unknown reference name '", id, "' found in SAM header ", - "(header.ref_ids(): "/* , ref_ids(), TODO */").")}; + if (id_it == reference_name_to_pos.end()) + { + warning("The reference sequence name \"", id, "\" was present in the header but not in the " + "user provided rnames."); - auto & given_ref_info = ref_id_info[id_it->second]; + push_back_rname(id, std::get<0>(info), std::get<1>(info)); + } + else + { + assert(id_it->second < static_castsecond)>(reference_names_info.size())); - if (std::get<0>(given_ref_info) != std::get<0>(info)) - throw format_error{"Provided and header-based reference length differ."}; + if (std::get<0>(reference_names_info[id_it->second]) != std::get<0>(info)) + warning("Provided and header-based reference length differ for rname :\"", id, "\"."); - ref_id_info[id_it->second] = std::move(info); - } - else - { // If not, we need to update the ids first and fill the reference dictionary afterwards. - static_assert(!seqan3::detail::is_type_specialisation_of_v, - "The range over reference ids must be of type std::deque such that pointers are not " - "invalidated."); - - ref_ids().push_back(id); - ref_id_info.push_back(info); - ref_dict[(ref_ids())[(ref_ids()).size() - 1]] = (ref_ids()).size() - 1; + reference_names_info[id_it->second] = info; // update rname information + } } break; } @@ -552,4 +567,15 @@ void header::read(std::string_view header_string) } } +//!\brief A datastructure that contains private data of sam IO records. +//!\ingroup var_io +struct record_private_data +{ + //!\privatesection + //!\brief Pointer to the header + header const * header_ptr = nullptr; + + friend bool operator==(record_private_data const &, record_private_data const &) = default; //!< Defaulted. +}; + } // namespace bio::map_io diff --git a/include/bio/map_io/misc.hpp b/include/bio/map_io/misc.hpp index ef96fcb..91b3095 100644 --- a/include/bio/map_io/misc.hpp +++ b/include/bio/map_io/misc.hpp @@ -32,6 +32,7 @@ inline constexpr auto default_field_ids = vtag; + field::qual, + field::_private>; } // namespace bio::map_io diff --git a/include/bio/map_io/reader_options.hpp b/include/bio/map_io/reader_options.hpp index b5c2fe9..bab8671 100644 --- a/include/bio/map_io/reader_options.hpp +++ b/include/bio/map_io/reader_options.hpp @@ -66,8 +66,8 @@ inline constexpr auto field_types_sam = int32_t, // field::pnext, int32_t, // field::tlen, decltype(std::string_view{} | seqan3::views::char_to), // field::seq - decltype(std::string_view{} | seqan3::views::char_to)/*,// field::qual - record_private_data */>; // field::_private + decltype(std::string_view{} | seqan3::views::char_to), // field::qual + record_private_data>; // field::_private //!\} /*!\brief Options that can be used to configure the behaviour of bio::map_io::reader. diff --git a/test/unit/format/sam_file_format_test_template.hpp b/test/unit/format/sam_file_format_test_template.hpp index 5b1dc2c..42d8359 100644 --- a/test/unit/format/sam_file_format_test_template.hpp +++ b/test/unit/format/sam_file_format_test_template.hpp @@ -31,6 +31,7 @@ using seqan3::operator""_tag; // global variables for reuse bio::map_io::reader_options default_options{}; +constexpr std::string_view warning_prefix{"[B.I.O sam format warning in line "}; struct sam_file_data : public ::testing::Test { @@ -241,106 +242,55 @@ TYPED_TEST_P(sam_file_read, select_fields) } } -// TYPED_TEST_P(sam_file_read, read_in_alignment_only_with_ref) -// { -// { -// typename TestFixture::stream_type istream{this->simple_three_reads_input}; -// seqan3::sam_file_input fin{istream, -// this->ref_ids, -// this->ref_sequences, -// TypeParam{}, -// seqan3::fields{}}; - -// size_t i{0}; -// for (auto & [alignment] : fin) -// { -// EXPECT_RANGE_EQ(std::get<0>(alignment), std::get<0>(this->alignments[i])); -// EXPECT_RANGE_EQ(std::get<1>(alignment), std::get<1>(this->alignments[i])); -// ++i; -// } -// } - -// { // empty cigar -// typename TestFixture::stream_type istream{this->empty_cigar}; -// seqan3::sam_file_input fin{istream, -// this->ref_ids, -// this->ref_sequences, -// TypeParam{}, -// seqan3::fields{}}; +TYPED_TEST_P(sam_file_read, warning_rname_not_in_header) +{ + typename TestFixture::stream_type istream{this->verbose_reads_input}; -// EXPECT_TRUE(std::ranges::empty(std::get<0>((*fin.begin()).alignment()))); -// EXPECT_TRUE(std::ranges::empty(std::get<1>((*fin.begin()).alignment()))); -// } -// } + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; -// TYPED_TEST_P(sam_file_read, read_in_alignment_only_without_ref) -// { -// { -// typename TestFixture::stream_type istream{this->simple_three_reads_input}; -// seqan3::sam_file_input fin{istream, TypeParam{}, seqan3::fields{}}; + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; -// size_t i{0}; -// for (auto & [alignment] : fin) -// { -// EXPECT_RANGE_EQ(std::get<1>(alignment), std::get<1>(this->alignments[i++])); -// auto & ref_aln = std::get<0>(alignment); -// EXPECT_THROW((ref_aln[0]), std::logic_error); // access on a dummy seq is not allowed -// } -// } + std::string warning_msg{warning_prefix.data() + std::string{"1] The reference sequence name \""} + this->ref_id + + "\" is not present in the header.\n"}; -// { // empty cigar -// typename TestFixture::stream_type istream{this->empty_cigar}; -// seqan3::sam_file_input fin{istream, TypeParam{}, seqan3::fields{}}; + for (unsigned i = 0; i < 3; ++i) + { + testing::internal::CaptureStderr(); + input_handler.parse_next_record_into(rec); + EXPECT_EQ(testing::internal::GetCapturedStderr(), ((i == 0) ? warning_msg : std::string{})); -// EXPECT_TRUE(std::ranges::empty(std::get<0>((*fin.begin()).alignment()))); -// EXPECT_TRUE(std::ranges::empty(std::get<1>((*fin.begin()).alignment()))); -// } -// } + EXPECT_EQ(rec.rname(), this->ref_id); + EXPECT_EQ(rec.pos(), this->positions[i]); + } +} -// TYPED_TEST_P(sam_file_read, read_mate_but_not_ref_id_with_ref) -// { -// { /*with reference information*/ -// typename TestFixture::stream_type istream{this->simple_three_reads_input}; -// seqan3::sam_file_input fin{istream, -// this->ref_ids, -// this->ref_sequences, -// TypeParam{}, -// seqan3::fields{}}; - -// size_t i{0}; -// for (auto & [mate] : fin) -// EXPECT_EQ(mate, this->mates[i++]); -// } -// } +TYPED_TEST_P(sam_file_read, warning_rnext_not_in_header) +{ + typename TestFixture::stream_type istream{this->verbose_reads_input}; -// TYPED_TEST_P(sam_file_read, read_mate_but_not_ref_id_without_ref) -// { -// std::tuple, std::optional, int32_t> mate; + constexpr auto fid = bio::vtag; + constexpr auto ftype = bio::ttag; -// { /*no reference information*/ -// typename TestFixture::stream_type istream{this->simple_three_reads_input}; -// seqan3::sam_file_input fin{istream, TypeParam{}, seqan3::fields{}}; + using record_t = bio::record, std::remove_cvref_t>; + bio::format_input_handler input_handler{istream, default_options}; + record_t rec; -// size_t i{0}; -// for (auto & [mate] : fin) -// EXPECT_EQ(mate, this->mates[i++]); -// } -// } + std::string warning_msg{warning_prefix.data() + std::string{"1] The mates reference sequence name \""} + + this->ref_id + "\" is not present in the header.\n"}; -// TYPED_TEST_P(sam_file_read, format_error_ref_id_not_in_reference_information) -// { -// { // with reference information given -// typename TestFixture::stream_type istream{this->unknown_ref}; -// seqan3::sam_file_input fin{istream, this->ref_ids, this->ref_sequences, TypeParam{}}; -// EXPECT_THROW((fin.begin()), seqan3::format_error); -// } + for (unsigned i = 0; i < 3; ++i) + { + testing::internal::CaptureStderr(); + input_handler.parse_next_record_into(rec); + EXPECT_EQ(testing::internal::GetCapturedStderr(), ((i == 0) ? warning_msg : std::string{})); -// { // with reference information in the header -// typename TestFixture::stream_type istream{this->unknown_ref_header}; -// seqan3::sam_file_input fin{istream, TypeParam{}}; -// EXPECT_THROW((fin.begin()), seqan3::format_error); -// } -// } + EXPECT_EQ(rec.rnext(), this->ref_id); + EXPECT_EQ(rec.pnext(), std::get<1>(this->mates[i])); + } +} // TYPED_TEST_P(sam_file_read, format_error_uneven_hexadecimal_tag) // { @@ -349,16 +299,6 @@ TYPED_TEST_P(sam_file_read, select_fields) // EXPECT_THROW((fin.begin()), seqan3::format_error); // } -// // https://github.com/seqan/seqan3/pull/2423 -// TYPED_TEST_P(sam_file_read, issue2423) -// { -// typename TestFixture::stream_type istream{this->many_refs}; -// seqan3::sam_file_input fin{istream, TypeParam{}}; -// ASSERT_NO_THROW(fin.begin()); - -// EXPECT_EQ(fin.header().ref_id_info.size(), 64u); -// EXPECT_EQ(fin.header().ref_dict.size(), 64u); -// } // // ---------------------------------------------------------------------------- // // sam_file_write @@ -651,7 +591,12 @@ TYPED_TEST_P(sam_file_read, select_fields) // seqan3::format_error); // } -REGISTER_TYPED_TEST_SUITE_P(sam_file_read, full_data_set, all_missing_data, select_fields); +REGISTER_TYPED_TEST_SUITE_P(sam_file_read, + full_data_set, + all_missing_data, + select_fields, + warning_rname_not_in_header, + warning_rnext_not_in_header); // REGISTER_TYPED_TEST_SUITE_P(sam_file_read, // input_concept, diff --git a/test/unit/format/sam_input_test.cpp b/test/unit/format/sam_input_test.cpp index 286a736..2ba30f6 100644 --- a/test/unit/format/sam_input_test.cpp +++ b/test/unit/format/sam_input_test.cpp @@ -50,6 +50,7 @@ read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ )"}; std::string verbose_reads_input{ + // "@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n" // todo: check why adding this segfaults, adding this removes warnings "read1\t41\tref\t1\t61\t1S1M1D1M1I\t=\t10\t300\tACGT\t!##$\taa:A:c" "\tNM:i:-7" "\tAS:i:2" @@ -75,14 +76,6 @@ read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./ std::string unknown_ref_header{"@HD\tVN:1.6\n@SQ\tSN:ref\tLN:34\n*\t0\tunknown_ref\t1\t0\t4M\t*\t0\t0\tAAAA\t*\n"}; - std::string many_refs{[] () - { - std::string result{"@HD\tVN:1.6\n"}; - for (size_t i = 0; i < 64; ++i) - result += "@SQ\tSN:ref_" + std::to_string(i) + "\tLN:100\n"; - return result; - }()}; - // ----------------------------------------------------------------------------------------------------------------- // formatted output // ----------------------------------------------------------------------------------------------------------------- diff --git a/test/unit/map_io/header_test.cpp b/test/unit/map_io/header_test.cpp index ccf92bf..25ad031 100644 --- a/test/unit/map_io/header_test.cpp +++ b/test/unit/map_io/header_test.cpp @@ -52,9 +52,9 @@ R"(@HD VN:1.6 SO:coordinate SS:coordinate:queryname GO:none std::string id1{"ref"}; std::string id2{"ref2"}; - EXPECT_EQ(header.ref_id_info.size(), 2u); - EXPECT_EQ(header.ref_id_info[(header.ref_dict[id1])], (std::tuple{249250621u, ""})); - EXPECT_EQ(header.ref_id_info[(header.ref_dict[id2])], (std::tuple{243199373u, "AS:hs37d5"})); + EXPECT_EQ(header.rnames_info().size(), 2u); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id1))], (std::tuple{249250621u, ""})); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id2))], (std::tuple{243199373u, "AS:hs37d5"})); EXPECT_EQ(header.read_groups[0], (std::pair{"U0a_A2_L1", "PL:illumina\tPU:1\tLB:1\tSM:NA12878"})); @@ -92,8 +92,8 @@ TEST(map_io_header, independent_order) EXPECT_EQ(header.program_infos[0].command_line_call, "novoalign -d /hs37d5.ndx -f /file.fastq.gz"); std::string id2{"ref2"}; - EXPECT_EQ(header.ref_id_info.size(), 1u); - EXPECT_EQ(header.ref_id_info[(header.ref_dict[id2])], (std::tuple{243199373u, "AS:hs37d5"})); + EXPECT_EQ(header.rnames_info().size(), 1u); + EXPECT_EQ(header.rnames_info()[(header.rname_to_pos().at(id2))], (std::tuple{243199373u, "AS:hs37d5"})); EXPECT_EQ(header.read_groups[0], (std::pair{"U0a_A2_L1", "LB:1\tSM:NA12878\tPL:illumina\tPU:1"})); @@ -101,6 +101,25 @@ TEST(map_io_header, independent_order) EXPECT_EQ(header.comments[0], "Tralalalalalala this is a comment"); } +TEST(map_io_header, issue2423) +{ + // former seqan3 issue https://github.com/seqan/seqan3/pull/2423 + std::string many_refs_input{[] () + { + std::string result{"@HD\tVN:1.6\n"}; + for (size_t i = 0; i < 64; ++i) + result += "@SQ\tSN:ref_" + std::to_string(i) + "\tLN:100\n"; + return result; + }()}; + + bio::map_io::header header{}; + EXPECT_NO_THROW(header.read(many_refs_input)); + + EXPECT_EQ(header.rnames().size(), 64u); + EXPECT_EQ(header.rnames_info().size(), 64u); + EXPECT_EQ(header.rname_to_pos().size(), 64u); +} + TEST(map_io_header_warnings, user_defined_tags) { // user defined tags should not trigger errors, but print warnings to cerr