Skip to content

Commit

Permalink
header reference works
Browse files Browse the repository at this point in the history
  • Loading branch information
smehringer committed Feb 1, 2022
1 parent 6e5f334 commit df0dfdc
Show file tree
Hide file tree
Showing 7 changed files with 231 additions and 199 deletions.
72 changes: 60 additions & 12 deletions include/bio/format/sam_input_handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <string_view>
#include <vector>

#include <seqan3/core/debug_stream.hpp>
#include <seqan3/utility/type_list/traits.hpp>
#include <seqan3/io/sam_file/detail/cigar.hpp>

Expand Down Expand Up @@ -60,13 +61,13 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
//!\brief The raw record.
raw_record_type raw_record;
//!\brief The header.
map_io::header<std::vector<std::string>> header;
map_io::header header;
//!\brief Lowlevel stream iterator.
lowlevel_iterator file_it;
//!\brief Cache of the chromosome string.
std::string last_chrom;
std::string last_rname;
//!\brief Cache of the chromosome index.
int32_t last_chrom_idx = -1;
int32_t last_rname_idx = -1;
//!\brief Current line number in file.
size_t line = 0;
//!\brief Whether to print print_warnings or not.
Expand All @@ -80,6 +81,16 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
throw format_error{message};
}

/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
{
if (print_warnings)
{
seqan3::debug_stream << "[B.I.O sam format warning in line " << line << "] ";
(seqan3::debug_stream << ... << messages);
seqan3::debug_stream << std::endl;
}
}

void read_raw_record()
{
++line;
Expand Down Expand Up @@ -126,9 +137,27 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
std::string_view raw_field = get<field::rname>(raw_record);

if (raw_field != "*")
parse_field_aux(raw_field, parsed_field);
{
size_t rname_pos;

if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end())
{ // rname name was not in header, insert!
header.push_back_rname(raw_field, raw_field.size(), "");

rname_pos = header.rnames().size() - 1;

warning("The reference sequence name \"", raw_field, "\" is not present in the header.");
}
else
{
rname_pos = it->second;
}

// todo insert into header
if constexpr (std::integral<std::remove_cvref_t<parsed_field_t>>)
parsed_field = rname_pos;
else
parsed_field = header.rnames()[rname_pos];
}
}

/* POS, MAPQ are handled correctly by default, unless we want pos to be read into an std:optional */
Expand All @@ -149,7 +178,7 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
auto const res = std::from_chars(ptr, end, cigar_count); // reads number up to next chatacter

if (res.ec != std::errc{}) // parse number
throw format_error{"Corrupted cigar string encountered"}; // todo, how in b.i.o?
error("Corrupted cigar string encountered");

ptr = res.ptr + 1; // skip cigar operation character

Expand All @@ -169,7 +198,25 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
if (raw_field == "=")
raw_field = get<field::rname>(raw_record);

parse_field_aux(raw_field, parsed_field);
size_t rname_pos;

if (auto it = header.rname_to_pos().find(raw_field); it == header.rname_to_pos().end())
{ // rname name was not in header, insert!
header.push_back_rname(raw_field, raw_field.size(), "");

rname_pos = header.rnames().size() - 1;

warning("The mates reference sequence name \"", raw_field, "\" is not present in the header.");
}
else
{
rname_pos = it->second;
}

if constexpr (std::integral<std::remove_cvref_t<parsed_field_t>>)
parsed_field = rname_pos;
else
parsed_field = header.rnames()[rname_pos];
}
}

Expand All @@ -195,10 +242,11 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
parse_field_aux(raw_field, parsed_field); // reading into e.g. dna4 vector
}

// void parse_field(vtag_t<field::_private> const & /**/, map_io::record_private_data & parsed_field)
// {
// parsed_field = map_io::record_private_data{.header_ptr = &header};
// }
//!\brief Overload for parsing the private data.
void parse_field(vtag_t<field::_private> const & /**/, map_io::record_private_data & parsed_field)
{
parsed_field = map_io::record_private_data{.header_ptr = &header};
}

public:
format_input_handler() = default;
Expand Down Expand Up @@ -232,7 +280,7 @@ class format_input_handler<sam> : public format_input_handler_base<format_input_
else if (header_string.ends_with("\n"))
header_string = header_string.substr(0, header_string.size() - 1);

header = map_io::header<std::vector<std::string>>{};
header = map_io::header{};
header.read(header_string);
}

Expand Down
168 changes: 97 additions & 71 deletions include/bio/map_io/header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <seqan3/io/sam_file/detail/cigar.hpp>
#include <seqan3/utility/views/single_pass_input.hpp>
#include <seqan3/utility/views/type_reduce.hpp>
#include <seqan3/core/debug_stream.hpp>

#include <bio/exception.hpp>
#include <bio/misc.hpp>
Expand All @@ -41,7 +42,6 @@ namespace bio::map_io
*
* TODO
*/
template <typename ref_ids_type = std::vector<std::string>>
class header
{
public:
Expand All @@ -56,63 +56,63 @@ class header
header & operator=(header const &) = default; //!< Defaulted.
header & operator=(header &&) = default; //!< Defaulted.

/*!\brief Construct from a range of reference ids which redirects the `ref_ids_ptr` member (non-owning).
/*!\brief Construct from a range of reference ids.
* \param[in] The plain text header.
* \param[in] ref_ids The range over reference ids to redirect the pointer at.
*/
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
header(ref_ids_type & ref_ids) :
ref_ids_ptr{&ref_ids, ref_ids_deleter_noop},
reference_names_present_in_header{true}
{}
reference_names_given_on_construction{true}
{
for (auto & id : ref_ids)
reference_names.emplace_back(id);
}

/*!\brief Construct from a rvalue range of reference ids which is moved into the `ref_ids_ptr` (owning).
/*!\brief Construct from a rvalue range of reference ids. Ids are copied over.
* \param[in] ref_ids The range over reference ids to own.
*/
template <typename ref_ids_type> // todo: restrict value type to be std::string_view constructible
header(ref_ids_type && ref_ids) :
ref_ids_ptr{new ref_ids_type{std::move(ref_ids)}, ref_ids_deleter_default},
reference_names_present_in_header{true}
{}
reference_names_given_on_construction{true}
{
for (auto & id : ref_ids)
{
owned_reference_names.push_back(id);
reference_names.emplace_back(owned_reference_names.back());
}
}
//!\}

//!\brief Defaulted three-way comparison.
auto operator<=>(header const &) const = default;

inline void read(std::string_view header_string);

private:
//!\brief The type of the internal ref_ids pointer. Allows dynamically setting ownership management.
using ref_ids_ptr_t = std::unique_ptr<ref_ids_type, std::function<void(ref_ids_type*)>>;
//!\brief Stream deleter that does nothing (no ownership assumed).
static void ref_ids_deleter_noop(ref_ids_type *) {}
//!\brief Stream deleter with default behaviour (ownership assumed).
static void ref_ids_deleter_default(ref_ids_type * ptr) { delete ptr; }
//!\brief The key's type of ref_dict.
using key_type = std::conditional_t<std::ranges::contiguous_range<std::ranges::range_reference_t<ref_ids_type>>,
std::span<seqan3::range_innermost_value_t<ref_ids_type> const>,
seqan3::type_reduce_t<std::ranges::range_reference_t<ref_ids_type>>>;
//!\brief The pointer to reference ids information (non-owning if reference information is given).
ref_ids_ptr_t ref_ids_ptr{new ref_ids_type{}, ref_ids_deleter_default};
//!\brief Whether reference names are present in the current header.
bool reference_names_present_in_header{false};

//!\brief Custom hash function since std::hash is not defined for all range types (e.g. std::span<char>).
struct key_hasher
{
//!\brief Hash a key.
template <typename key_t>
size_t operator()(key_t && key) const noexcept
{
using char_t = std::ranges::range_value_t<key_t>;
size_t result{0};
std::hash<char_t> h{};
for (char_t character : key)
{
result *= 0x8F3F73B5CF1C9ADE;
result += h(character);
}
return result;
}
};
//!\brief In case no reference information is given on construction, the reference names are stored here.
std::deque<std::string> owned_reference_names;

//!\brief The reference sequence names.
std::vector<std::string_view> reference_names;

std::vector<std::tuple<int32_t, std::string>> reference_names_info{};

//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
std::unordered_map<std::string_view, int32_t> reference_name_to_pos{};

//!\brief Whether reference sequence names were given to the header on construction.
bool reference_names_given_on_construction{false};

/* [[noreturn]] compiler says this returns something...? */ void warning(auto const &... messages) const
{
// if (print_warnings)
// {
// seqan3::debug_stream << "[B.I.O sam format header warning in line " << line << "] "; // todo line numbers
seqan3::debug_stream << "[B.I.O sam format header warning] ";
(seqan3::debug_stream << ... << messages);
seqan3::debug_stream << std::endl;
// }
}
public:
/*!\name [@HD] File-level meta data
* \brief You can directly edit these member variables.
Expand All @@ -125,9 +125,10 @@ class header
//!\}

/*!\name [@SQ] Reference sequence dictionary
* \brief You can directly edit these member variables.
* \brief You **CANNOT** directly edit these member variables. Please use the respective modifiers.
* \{
*/

/*!\brief [@SQ SN] Reference sequence names
*
* \details
Expand All @@ -146,9 +147,9 @@ class header
* In this case, the reference information is parsed from the records field::ref_id and stored in the header.
* This member function then provides access to the unique list of reference names encountered in the records.
*/
ref_ids_type & ref_ids()
std::vector<std::string_view> const & rnames()
{
return *ref_ids_ptr;
return reference_names;
}

/*!\brief [@SQ LN,AH,AN,AS,M5,SP,UR] Reference sequence auxiliary information
Expand Down Expand Up @@ -182,10 +183,25 @@ class header
* * **UR:** URI of the sequence. This value may start with one of the standard protocols, e.g http: or ftp:. If
* it does not start with one of these protocols, it is assumed to be a file-system path
*/
std::vector<std::tuple<int32_t, std::string>> ref_id_info{};
std::vector<std::tuple<int32_t, std::string>> const & rnames_info()
{
return reference_names_info;
}

//!\brief The mapping of reference name to position in the reference_names range and the rnames_info() range.
std::unordered_map<std::string_view, int32_t> const & rname_to_pos()
{
return reference_name_to_pos;
}

//!\brief The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
std::unordered_map<key_type, int32_t, key_hasher, seqan3::detail::view_equality_fn> ref_dict{};
//!\brief Append a new rname entry with it's length and extra information and update reference_name_to_pos.
void push_back_rname(std::string_view const rname, int32_t const length, std::string_view const extra_info)
{
owned_reference_names.emplace_back(rname);
reference_names.emplace_back(owned_reference_names.back());
reference_names_info.emplace_back(length, extra_info);
reference_name_to_pos[reference_names.back()] = reference_names.size() - 1;
}
//!\}

/*!\name [@RG] Read groups
Expand Down Expand Up @@ -270,8 +286,7 @@ class header
* not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual
* error.
*/
template <typename ref_ids_type>
void header<ref_ids_type>::read(std::string_view header_string)
void header::read(std::string_view header_string)
{
auto stream_view = header_string | seqan3::views::single_pass_input;

Expand Down Expand Up @@ -379,7 +394,7 @@ void header<ref_ids_type>::read(std::string_view header_string)

case make_tag('S', 'Q'): // SQ (sequence dictionary) tag
{
std::ranges::range_value_t<ref_ids_type> id;
std::string id;
std::tuple<int32_t, std::string> info{};
bool sequence_length_was_present{};

Expand Down Expand Up @@ -416,30 +431,30 @@ void header<ref_ids_type>::read(std::string_view header_string)
if (!sequence_length_was_present)
throw format_error{"The required LN tag in @SQ is missing."};

if (reference_names_present_in_header)
{ // If reference information was given, the ids exist and we can fill ref_dict directly.
auto id_it = ref_dict.find(id);
if (!reference_names_given_on_construction)
{ // Reference information was not given by the user but needs to be filled from the header
push_back_rname(id, std::get<0>(info), std::get<1>(info));
}
else
{ // If reference information was given we check them against the once in the header
auto id_it = reference_name_to_pos.find(id);

if (id_it == ref_dict.end())
throw format_error{seqan3::detail::to_string("Unknown reference name '", id, "' found in SAM header ",
"(header.ref_ids(): "/* , ref_ids(), TODO */").")};
if (id_it == reference_name_to_pos.end())
{
warning("The reference sequence name \"", id, "\" was present in the header but not in the "
"user provided rnames.");

auto & given_ref_info = ref_id_info[id_it->second];
push_back_rname(id, std::get<0>(info), std::get<1>(info));
}
else
{
assert(id_it->second < static_cast<decltype(id_it->second)>(reference_names_info.size()));

if (std::get<0>(given_ref_info) != std::get<0>(info))
throw format_error{"Provided and header-based reference length differ."};
if (std::get<0>(reference_names_info[id_it->second]) != std::get<0>(info))
warning("Provided and header-based reference length differ for rname :\"", id, "\".");

ref_id_info[id_it->second] = std::move(info);
}
else
{ // If not, we need to update the ids first and fill the reference dictionary afterwards.
static_assert(!seqan3::detail::is_type_specialisation_of_v<ref_ids_type, std::deque>,
"The range over reference ids must be of type std::deque such that pointers are not "
"invalidated.");

ref_ids().push_back(id);
ref_id_info.push_back(info);
ref_dict[(ref_ids())[(ref_ids()).size() - 1]] = (ref_ids()).size() - 1;
reference_names_info[id_it->second] = info; // update rname information
}
}
break;
}
Expand Down Expand Up @@ -552,4 +567,15 @@ void header<ref_ids_type>::read(std::string_view header_string)
}
}

//!\brief A datastructure that contains private data of sam IO records.
//!\ingroup var_io
struct record_private_data
{
//!\privatesection
//!\brief Pointer to the header
header const * header_ptr = nullptr;

friend bool operator==(record_private_data const &, record_private_data const &) = default; //!< Defaulted.
};

} // namespace bio::map_io
Loading

0 comments on commit df0dfdc

Please sign in to comment.