SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <seqan3/std/concepts>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <string>
20 #include <vector>
21 
48 
49 namespace seqan3
50 {
51 
63 {
64 public:
68  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
69  format_bam() = default;
70  format_bam(format_bam const &) = default;
71  format_bam & operator=(format_bam const &) = default;
72  format_bam(format_bam &&) = default;
73  format_bam & operator=(format_bam &&) = default;
74  ~format_bam() = default;
75 
77 
80  {
81  { "bam" }
82  };
83 
84 protected:
85  template <typename stream_type, // constraints checked by file
86  typename seq_legal_alph_type,
87  typename ref_seqs_type,
88  typename ref_ids_type,
89  typename seq_type,
90  typename id_type,
91  typename offset_type,
92  typename ref_seq_type,
93  typename ref_id_type,
94  typename ref_offset_type,
95  typename align_type,
96  typename cigar_type,
97  typename flag_type,
98  typename mapq_type,
99  typename qual_type,
100  typename mate_type,
101  typename tag_dict_type,
102  typename e_value_type,
103  typename bit_score_type>
104  void read_alignment_record(stream_type & stream,
105  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
106  ref_seqs_type & ref_seqs,
108  seq_type & seq,
109  qual_type & qual,
110  id_type & id,
111  offset_type & offset,
112  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
113  ref_id_type & ref_id,
114  ref_offset_type & ref_offset,
115  align_type & align,
116  cigar_type & cigar_vector,
117  flag_type & flag,
118  mapq_type & mapq,
119  mate_type & mate,
120  tag_dict_type & tag_dict,
121  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
122  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
123 
124  template <typename stream_type,
125  typename header_type,
126  typename seq_type,
127  typename id_type,
128  typename ref_seq_type,
129  typename ref_id_type,
130  typename align_type,
131  typename cigar_type,
132  typename qual_type,
133  typename mate_type,
134  typename tag_dict_type>
135  void write_alignment_record([[maybe_unused]] stream_type & stream,
136  [[maybe_unused]] sam_file_output_options const & options,
137  [[maybe_unused]] header_type && header,
138  [[maybe_unused]] seq_type && seq,
139  [[maybe_unused]] qual_type && qual,
140  [[maybe_unused]] id_type && id,
141  [[maybe_unused]] int32_t const offset,
142  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
143  [[maybe_unused]] ref_id_type && ref_id,
144  [[maybe_unused]] std::optional<int32_t> ref_offset,
145  [[maybe_unused]] align_type && align,
146  [[maybe_unused]] cigar_type && cigar_vector,
147  [[maybe_unused]] sam_flag const flag,
148  [[maybe_unused]] uint8_t const mapq,
149  [[maybe_unused]] mate_type && mate,
150  [[maybe_unused]] tag_dict_type && tag_dict,
151  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
152  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
153 
154 private:
156  bool header_was_read{false};
157 
160 
163  { // naming corresponds to official SAM/BAM specifications
164  int32_t block_size;
165  int32_t refID;
166  int32_t pos;
167  uint32_t l_read_name:8;
168  uint32_t mapq:8;
169  uint32_t bin:16;
170  uint32_t n_cigar_op:16;
172  int32_t l_seq;
173  int32_t next_refID;
174  int32_t next_pos;
175  int32_t tlen;
176  };
177 
180  {
181  [] () constexpr
182  {
184 
185  using index_t = std::make_unsigned_t<char>;
186 
187  // ret['M'] = 0; set anyway by initialization
188  ret[static_cast<index_t>('I')] = 1;
189  ret[static_cast<index_t>('D')] = 2;
190  ret[static_cast<index_t>('N')] = 3;
191  ret[static_cast<index_t>('S')] = 4;
192  ret[static_cast<index_t>('H')] = 5;
193  ret[static_cast<index_t>('P')] = 6;
194  ret[static_cast<index_t>('=')] = 7;
195  ret[static_cast<index_t>('X')] = 8;
196 
197  return ret;
198  }()
199  };
200 
202  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
203  {
204  --end;
205  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
206  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
207  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
208  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
209  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
210  return 0;
211  }
212 
213  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
214 
221  template <typename stream_view_type, std::integral number_type>
222  void read_field(stream_view_type && stream_view, number_type & target)
223  {
224  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
225  }
226 
232  template <typename stream_view_type>
233  void read_field(stream_view_type && stream_view, float & target)
234  {
235  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
236  }
237 
248  template <typename stream_view_type, typename optional_value_type>
249  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
250  {
251  optional_value_type tmp;
252  read_field(std::forward<stream_view_type>(stream_view), tmp);
253  target = tmp;
254  }
255 
256  template <typename stream_view_type, typename value_type>
258  stream_view_type && stream_view,
259  value_type const & SEQAN3_DOXYGEN_ONLY(value));
260 
261  template <typename stream_view_type>
262  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
263 
264  template <typename cigar_input_type>
265  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
266 
267  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
268 };
269 
271 template <typename stream_type, // constraints checked by file
272  typename seq_legal_alph_type,
273  typename ref_seqs_type,
274  typename ref_ids_type,
275  typename seq_type,
276  typename id_type,
277  typename offset_type,
278  typename ref_seq_type,
279  typename ref_id_type,
280  typename ref_offset_type,
281  typename align_type,
282  typename cigar_type,
283  typename flag_type,
284  typename mapq_type,
285  typename qual_type,
286  typename mate_type,
287  typename tag_dict_type,
288  typename e_value_type,
289  typename bit_score_type>
290 inline void format_bam::read_alignment_record(stream_type & stream,
291  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
292  ref_seqs_type & ref_seqs,
294  seq_type & seq,
295  qual_type & qual,
296  id_type & id,
297  offset_type & offset,
298  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
299  ref_id_type & ref_id,
300  ref_offset_type & ref_offset,
301  align_type & align,
302  cigar_type & cigar_vector,
303  flag_type & flag,
304  mapq_type & mapq,
305  mate_type & mate,
306  tag_dict_type & tag_dict,
307  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
308  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
309 {
310  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
311  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
312  "The ref_offset must be a specialisation of std::optional.");
313 
314  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
315  "The type of field::mapq must be uint8_t.");
316 
317  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
318  "The type of field::flag must be seqan3::sam_flag.");
319 
320  auto stream_view = seqan3::views::istreambuf(stream);
321 
322  // these variables need to be stored to compute the ALIGNMENT
323  [[maybe_unused]] int32_t offset_tmp{};
324  [[maybe_unused]] int32_t soft_clipping_end{};
325  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
326  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
327 
328  // Header
329  // -------------------------------------------------------------------------------------------------------------
330  if (!header_was_read)
331  {
332  // magic BAM string
333  if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
334  throw format_error{"File is not in BAM format."};
335 
336  int32_t tmp32{};
337  read_field(stream_view, tmp32);
338 
339  if (tmp32 > 0) // header text is present
340  read_header(stream_view | views::take_exactly_or_throw(tmp32), header, ref_seqs);
341 
342  int32_t n_ref;
343  read_field(stream_view, n_ref);
344 
345  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
346  {
347  read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
348 
349  string_buffer.resize(tmp32 - 1);
350  std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
351  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
352 
353  read_field(stream_view, tmp32); // l_ref (length of reference sequence)
354 
355  auto id_it = header.ref_dict.find(string_buffer);
356 
357  // sanity checks of reference information to existing header object:
358  if (id_it == header.ref_dict.end()) // [unlikely]
359  {
360  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
361  "' found in BAM file header (header.ref_ids():",
362  header.ref_ids(), ").")};
363  }
364  else if (id_it->second != ref_idx) // [unlikely]
365  {
366  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
367  " does not correspond to the position ", id_it->second,
368  " in the header (header.ref_ids():", header.ref_ids(), ").")};
369  }
370  else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
371  {
372  throw format_error{"Provided reference has unequal length as specified in the header."};
373  }
374  }
375 
376  header_was_read = true;
377 
378  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
379  return;
380  }
381 
382  // read alignment record into buffer
383  // -------------------------------------------------------------------------------------------------------------
385  std::ranges::copy(stream_view | views::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
386 
387  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
388  {
389  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
390  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
391  }
392  else if (core.refID > -1) // not unmapped
393  {
394  ref_id = core.refID; // field::ref_id
395  }
396 
397  flag = core.flag; // field::flag
398  mapq = core.mapq; // field::mapq
399 
400  if (core.pos > -1) // [[likely]]
401  ref_offset = core.pos; // field::ref_offset
402 
403  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
404  {
405  if (core.next_refID > -1)
406  get<0>(mate) = core.next_refID;
407 
408  if (core.next_pos > -1) // [[likely]]
409  get<1>(mate) = core.next_pos;
410 
411  get<2>(mate) = core.tlen;
412  }
413 
414  // read id
415  // -------------------------------------------------------------------------------------------------------------
416  read_field(stream_view | views::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
417  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
418 
419  // read cigar string
420  // -------------------------------------------------------------------------------------------------------------
421  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
422  {
423  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
424  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
425  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
426  }
427  else
428  {
430  }
431 
432  offset = offset_tmp;
433 
434  // read sequence
435  // -------------------------------------------------------------------------------------------------------------
436  if (core.l_seq > 0) // sequence information is given
437  {
438  auto seq_stream = stream_view
439  | views::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
441  {
442  return {sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
443  sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
444  });
445 
446  if constexpr (detail::decays_to_ignore_v<seq_type>)
447  {
448  auto skip_sequence_bytes = [&] ()
449  {
450  detail::consume(seq_stream);
451  if (core.l_seq & 1)
452  std::ranges::next(std::ranges::begin(stream_view));
453  };
454 
455  if constexpr (!detail::decays_to_ignore_v<align_type>)
456  {
457  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
458  "If you want to read ALIGNMENT but not SEQ, the alignment"
459  " object must store a sequence container at the second (query) position.");
460 
461  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
462  {
463  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
464  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
465  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
466 
467  get<1>(align).reserve(seq_length);
468 
469  auto tmp_iter = std::ranges::begin(seq_stream);
470  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
471 
472  if (offset_tmp & 1)
473  {
474  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
475  ++tmp_iter;
476  --seq_length;
477  }
478 
479  for (size_t i = (seq_length / 2); i > 0; --i)
480  {
481  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
482  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
483  ++tmp_iter;
484  }
485 
486  if (seq_length & 1)
487  {
488  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
489  ++tmp_iter;
490  --soft_clipping_end;
491  }
492 
493  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
494  }
495  else
496  {
497  skip_sequence_bytes();
498  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
499  }
500  }
501  else
502  {
503  skip_sequence_bytes();
504  }
505  }
506  else
507  {
508  using alph_t = std::ranges::range_value_t<decltype(seq)>;
509  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
510 
511  for (auto [d1, d2] : seq_stream)
512  {
513  seq.push_back(from_dna16[to_rank(d1)]);
514  seq.push_back(from_dna16[to_rank(d2)]);
515  }
516 
517  if (core.l_seq & 1)
518  {
519  sam_dna16 d = sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
520  seq.push_back(from_dna16[to_rank(d)]);
521  std::ranges::next(std::ranges::begin(stream_view));
522  }
523 
524  if constexpr (!detail::decays_to_ignore_v<align_type>)
525  {
526  assign_unaligned(get<1>(align),
527  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
528  std::ranges::distance(seq) - soft_clipping_end));
529  }
530  }
531  }
532 
533  // read qual string
534  // -------------------------------------------------------------------------------------------------------------
535  read_field(stream_view | views::take_exactly_or_throw(core.l_seq)
536  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
537 
538  // All remaining optional fields if any: SAM tags dictionary
539  // -------------------------------------------------------------------------------------------------------------
540  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
541  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
542  assert(remaining_bytes >= 0);
543  auto tags_view = stream_view | views::take_exactly_or_throw(remaining_bytes);
544 
545  while (tags_view.size() > 0)
546  read_field(tags_view, tag_dict);
547 
548  // DONE READING - wrap up
549  // -------------------------------------------------------------------------------------------------------------
550  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
551  {
552  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
553  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
554  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
555  if (core.l_seq != 0 && offset_tmp == core.l_seq)
556  {
557  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
558  { // maybe only throw in debug mode and otherwise return an empty alignment?
559  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
560  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
561  "stored in the optional field CG. You need to read in the field::tags and "
562  "field::seq in order to access this information.")};
563  }
564  else
565  {
566  auto it = tag_dict.find("CG"_tag);
567 
568  if (it == tag_dict.end())
569  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
570  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
571  "stored in the optional field CG but this tag is not present in the given ",
572  "record.")};
573 
574  auto cigar_view = std::views::all(std::get<std::string>(it->second));
575  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
576  offset_tmp = soft_clipping_end = 0;
577  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
578  tag_dict.erase(it); // remove redundant information
579 
580  if constexpr (!detail::decays_to_ignore_v<align_type>)
581  {
582  assign_unaligned(get<1>(align),
583  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
584  std::ranges::distance(seq) - soft_clipping_end));
585  }
586  }
587  }
588  }
589 
590  // Alignment object construction
591  if constexpr (!detail::decays_to_ignore_v<align_type>)
592  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
593 
594  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
595  std::swap(cigar_vector, tmp_cigar_vector);
596 }
597 
599 template <typename stream_type,
600  typename header_type,
601  typename seq_type,
602  typename id_type,
603  typename ref_seq_type,
604  typename ref_id_type,
605  typename align_type,
606  typename cigar_type,
607  typename qual_type,
608  typename mate_type,
609  typename tag_dict_type>
610 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
611  [[maybe_unused]] sam_file_output_options const & options,
612  [[maybe_unused]] header_type && header,
613  [[maybe_unused]] seq_type && seq,
614  [[maybe_unused]] qual_type && qual,
615  [[maybe_unused]] id_type && id,
616  [[maybe_unused]] int32_t const offset,
617  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
618  [[maybe_unused]] ref_id_type && ref_id,
619  [[maybe_unused]] std::optional<int32_t> ref_offset,
620  [[maybe_unused]] align_type && align,
621  [[maybe_unused]] cigar_type && cigar_vector,
622  [[maybe_unused]] sam_flag const flag,
623  [[maybe_unused]] uint8_t const mapq,
624  [[maybe_unused]] mate_type && mate,
625  [[maybe_unused]] tag_dict_type && tag_dict,
626  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
627  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
628 {
629  // ---------------------------------------------------------------------
630  // Type Requirements (as static asserts for user friendliness)
631  // ---------------------------------------------------------------------
632  static_assert((std::ranges::forward_range<seq_type> &&
633  alphabet<std::ranges::range_reference_t<seq_type>>),
634  "The seq object must be a std::ranges::forward_range over "
635  "letters that model seqan3::alphabet.");
636 
637  static_assert((std::ranges::forward_range<id_type> &&
638  alphabet<std::ranges::range_reference_t<id_type>>),
639  "The id object must be a std::ranges::forward_range over "
640  "letters that model seqan3::alphabet.");
641 
642  static_assert((std::ranges::forward_range<ref_seq_type> &&
643  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
644  "The ref_seq object must be a std::ranges::forward_range "
645  "over letters that model seqan3::alphabet.");
646 
647  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
648  {
649  static_assert((std::ranges::forward_range<ref_id_type> ||
650  std::integral<std::remove_reference_t<ref_id_type>> ||
651  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
652  "The ref_id object must be a std::ranges::forward_range "
653  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
654  }
655 
657  "The align object must be a std::pair of two ranges whose "
658  "value_type is comparable to seqan3::gap");
659 
660  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
661  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
662  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
663  "The align object must be a std::pair of two ranges whose "
664  "value_type is comparable to seqan3::gap");
665 
666  static_assert((std::ranges::forward_range<qual_type> &&
667  alphabet<std::ranges::range_reference_t<qual_type>>),
668  "The qual object must be a std::ranges::forward_range "
669  "over letters that model seqan3::alphabet.");
670 
672  "The mate object must be a std::tuple of size 3 with "
673  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
674  "2) a std::integral or std::optional<std::integral>, and "
675  "3) a std::integral.");
676 
677  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
678  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
679  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
680  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
681  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
682  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
683  "The mate object must be a std::tuple of size 3 with "
684  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
685  "2) a std::integral or std::optional<std::integral>, and "
686  "3) a std::integral.");
687 
688  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
689  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
690 
691  if constexpr (detail::decays_to_ignore_v<header_type>)
692  {
693  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
694  "You can either construct the output file with ref_ids and ref_seqs information and "
695  "the header will be created for you, or you can access the `header` member directly."};
696  }
697  else
698  {
699  // ---------------------------------------------------------------------
700  // logical Requirements
701  // ---------------------------------------------------------------------
702 
703  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
704  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
705 
706  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
707 
708  // ---------------------------------------------------------------------
709  // Writing the BAM Header on first call
710  // ---------------------------------------------------------------------
711  if (!header_was_written)
712  {
713  stream << "BAM\1";
715  write_header(os, options, header); // write SAM header to temporary stream to query the size.
716  int32_t l_text{static_cast<int32_t>(os.str().size())};
717  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
718 
719  stream << os.str();
720 
721  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
722  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
723 
724  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
725  {
726  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
727  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
728  // write reference name:
729  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
730  stream_it = '\0';
731  // write reference sequence length:
732  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
733  }
734 
735  header_was_written = true;
736  }
737 
738  // ---------------------------------------------------------------------
739  // Writing the Record
740  // ---------------------------------------------------------------------
741  int32_t ref_length{};
742 
743  // if alignment is non-empty, replace cigar_vector.
744  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
745  if (!std::ranges::empty(cigar_vector))
746  {
747  int32_t dummy_seq_length{};
748  for (auto & [count, operation] : cigar_vector)
749  update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
750  }
751  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
752  {
753  ref_length = std::ranges::distance(get<1>(align));
754 
755  // compute possible distance from alignment end to sequence end
756  // which indicates soft clipping at the end.
757  // This should be replaced by a free count_gaps function for
758  // aligned sequences which is more efficient if possible.
759  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
760 
761  for (auto chr : get<1>(align))
762  if (chr == gap{})
763  ++off_end;
764 
765  off_end -= ref_length;
766  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
767  }
768 
769  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
770  {
771  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
772  cigar_vector.resize(2);
773  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
774  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
775  }
776 
777  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
778 
779  // Compute the value for the l_read_name field for the bam record.
780  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
781  // the data type to store the value is uint8_t and 255 is the maximal size.
782  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
783  // 2 bytes.
784  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
785  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
786 
788  {
789  /* block_size */ 0, // will be initialised right after
790  /* refID */ -1, // will be initialised right after
791  /* pos */ ref_offset.value_or(-1),
792  /* l_read_name */ read_name_size,
793  /* mapq */ mapq,
794  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
795  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
796  /* flag */ flag,
797  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
798  /* next_refId */ -1, // will be initialised right after
799  /* next_pos */ get<1>(mate).value_or(-1),
800  /* tlen */ get<2>(mate)
801  };
802 
803  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
804  [[maybe_unused]] auto & id_target)
805  {
806  using id_t = std::remove_reference_t<decltype(id_source)>;
807 
808  if constexpr (!detail::decays_to_ignore_v<id_t>)
809  {
810  if constexpr (std::integral<id_t>)
811  {
812  id_target = id_source;
813  }
814  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
815  {
816  id_target = id_source.value_or(-1);
817  }
818  else
819  {
820  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
821  {
822  auto id_it = header.ref_dict.end();
823 
824  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
825  std::ranges::sized_range<decltype(id_source)> &&
826  std::ranges::borrowed_range<decltype(id_source)>)
827  {
828  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
829  std::ranges::size(id_source)});
830  }
831  else
832  {
833  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
834 
835  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
836  "The ref_id type is not convertible to the reference id information stored in the "
837  "reference dictionary of the header object.");
838 
839  id_it = header.ref_dict.find(id_source);
840  }
841 
842  if (id_it == header.ref_dict.end())
843  {
844  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
845  "not be found in BAM header ref_dict: ",
846  header.ref_dict, ".")};
847  }
848 
849  id_target = id_it->second;
850  }
851  }
852  }
853  };
854 
855  // initialise core.refID
856  check_and_assign_id_to(ref_id, core.refID);
857 
858  // initialise core.next_refID
859  check_and_assign_id_to(get<0>(mate), core.next_refID);
860 
861  // initialise core.block_size
862  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
863  core.l_read_name +
864  core.n_cigar_op * 4 + // each int32_t has 4 bytes
865  (core.l_seq + 1) / 2 + // bitcompressed seq
866  core.l_seq + // quality string
867  tag_dict_binary_str.size();
868 
869  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
870 
871  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
872  stream_it = '*';
873  else
874  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
875  stream_it = '\0';
876 
877  // write cigar
878  for (auto [cigar_count, op] : cigar_vector)
879  {
880  cigar_count = cigar_count << 4;
881  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
882  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
883  }
884 
885  // write seq (bit-compressed: sam_dna16 characters go into one byte)
886  using alph_t = std::ranges::range_value_t<seq_type>;
887  constexpr auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
888 
889  auto sit = std::ranges::begin(seq);
890  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
891  {
892  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
893  ++sidx, ++sit;
894  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
895  stream_it = static_cast<char>(compressed_chr);
896  }
897 
898  if (core.l_seq & 1) // write one more
899  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
900 
901  // write qual
902  if (std::ranges::empty(qual))
903  {
904  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
905  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
906  }
907  else
908  {
909  if (std::ranges::distance(qual) != core.l_seq)
910  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
911  core.l_seq, ". Got quality with size ",
912  std::ranges::distance(qual), " instead.")};
913 
914  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
915  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
916  }
917 
918  // write optional fields
919  stream << tag_dict_binary_str;
920  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
921 }
922 
924 template <typename stream_view_type, typename value_type>
926  stream_view_type && stream_view,
927  value_type const & SEQAN3_DOXYGEN_ONLY(value))
928 {
929  int32_t count;
930  read_field(stream_view, count); // read length of vector
931  std::vector<value_type> tmp_vector;
932  tmp_vector.reserve(count);
933 
934  while (count > 0)
935  {
936  value_type tmp{};
937  read_field(stream_view, tmp);
938  tmp_vector.push_back(std::move(tmp));
939  --count;
940  }
941  variant = std::move(tmp_vector);
942 }
943 
961 template <typename stream_view_type>
962 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
963 {
964  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
965  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
966  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
967  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
968  by the length (int32_t) of the array, followed by the values.
969  */
970  auto it = std::ranges::begin(stream_view);
971  uint16_t tag = static_cast<uint16_t>(*it) << 8;
972  ++it; // skip char read before
973 
974  tag += static_cast<uint16_t>(*it);
975  ++it; // skip char read before
976 
977  char type_id = *it;
978  ++it; // skip char read before
979 
980  switch (type_id)
981  {
982  case 'A' : // char
983  {
984  target[tag] = *it;
985  ++it; // skip char that has been read
986  break;
987  }
988  // all integer sizes are possible
989  case 'c' : // int8_t
990  {
991  int8_t tmp;
992  read_field(stream_view, tmp);
993  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
994  break;
995  }
996  case 'C' : // uint8_t
997  {
998  uint8_t tmp;
999  read_field(stream_view, tmp);
1000  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1001  break;
1002  }
1003  case 's' : // int16_t
1004  {
1005  int16_t tmp;
1006  read_field(stream_view, tmp);
1007  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1008  break;
1009  }
1010  case 'S' : // uint16_t
1011  {
1012  uint16_t tmp;
1013  read_field(stream_view, tmp);
1014  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1015  break;
1016  }
1017  case 'i' : // int32_t
1018  {
1019  int32_t tmp;
1020  read_field(stream_view, tmp);
1021  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1022  break;
1023  }
1024  case 'I' : // uint32_t
1025  {
1026  uint32_t tmp;
1027  read_field(stream_view, tmp);
1028  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1029  break;
1030  }
1031  case 'f' : // float
1032  {
1033  float tmp;
1034  read_field(stream_view, tmp);
1035  target[tag] = tmp;
1036  break;
1037  }
1038  case 'Z' : // string
1039  {
1040  string_buffer.clear();
1041  while (!is_char<'\0'>(*it))
1042  {
1043  string_buffer.push_back(*it);
1044  ++it;
1045  }
1046  ++it; // skip \0
1047  target[tag] = string_buffer;
1048  break;
1049  }
1050  case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1051  {
1052  std::vector<std::byte> byte_array;
1053  std::byte value;
1054  while (!is_char<'\0'>(*it))
1055  {
1056  string_buffer.clear();
1057  string_buffer.push_back(*it);
1058  ++it;
1059 
1060  if (*it == '\0')
1061  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1062 
1063  string_buffer.push_back(*it);
1064  ++it;
1065  read_field(string_buffer, value);
1066  byte_array.push_back(value);
1067  }
1068  ++it; // skip \0
1069  target[tag] = byte_array;
1070  break;
1071  }
1072  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1073  {
1074  char array_value_type_id = *it;
1075  ++it; // skip char read before
1076 
1077  switch (array_value_type_id)
1078  {
1079  case 'c' : // int8_t
1080  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1081  break;
1082  case 'C' : // uint8_t
1083  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1084  break;
1085  case 's' : // int16_t
1086  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1087  break;
1088  case 'S' : // uint16_t
1089  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1090  break;
1091  case 'i' : // int32_t
1092  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1093  break;
1094  case 'I' : // uint32_t
1095  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1096  break;
1097  case 'f' : // float
1098  read_sam_dict_vector(target[tag], stream_view, float{});
1099  break;
1100  default:
1101  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1102  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1103  }
1104  break;
1105  }
1106  default:
1107  throw format_error{detail::to_string("The second character in the numerical id of a "
1108  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1109  }
1110 }
1111 
1126 template <typename cigar_input_type>
1127 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1128 {
1129  std::vector<cigar> operations{};
1130  char operation{'\0'};
1131  uint32_t count{};
1132  int32_t ref_length{}, seq_length{};
1133  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1134  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1135  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1136 
1137  if (n_cigar_op == 0) // [[unlikely]]
1138  return std::tuple{operations, ref_length, seq_length};
1139 
1140  // parse the rest of the cigar
1141  // -------------------------------------------------------------------------------------------------------------
1142  while (n_cigar_op > 0) // until stream is not empty
1143  {
1144  std::ranges::copy_n(std::ranges::begin(cigar_input),
1145  sizeof(operation_and_count),
1146  reinterpret_cast<char*>(&operation_and_count));
1147  operation = cigar_mapping[operation_and_count & cigar_mask];
1148  count = operation_and_count >> 4;
1149 
1150  update_alignment_lengths(ref_length, seq_length, operation, count);
1151  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1152  --n_cigar_op;
1153  }
1154 
1155  return std::tuple{operations, ref_length, seq_length};
1156 }
1157 
1162 {
1163  std::string result{};
1164 
1165  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1166  {
1167  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1168  using T = std::remove_cvref_t<decltype(arg)>;
1169 
1170  if constexpr (std::same_as<T, int32_t>)
1171  {
1172  // always choose the smallest possible representation [cCsSiI]
1173  size_t const absolute_arg = std::abs(arg);
1174  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1175  bool const negative = arg < 0;
1176  n = n * n + 2 * negative; // for switch case order
1177 
1178  switch (n)
1179  {
1180  case 0:
1181  {
1182  result[result.size() - 1] = 'C';
1183  result.append(reinterpret_cast<char const *>(&arg), 1);
1184  break;
1185  }
1186  case 1:
1187  {
1188  result[result.size() - 1] = 'S';
1189  result.append(reinterpret_cast<char const *>(&arg), 2);
1190  break;
1191  }
1192  case 2:
1193  {
1194  result[result.size() - 1] = 'c';
1195  int8_t tmp = static_cast<int8_t>(arg);
1196  result.append(reinterpret_cast<char const *>(&tmp), 1);
1197  break;
1198  }
1199  case 3:
1200  {
1201  result[result.size() - 1] = 's';
1202  int16_t tmp = static_cast<int16_t>(arg);
1203  result.append(reinterpret_cast<char const *>(&tmp), 2);
1204  break;
1205  }
1206  default:
1207  {
1208  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1209  break;
1210  }
1211  }
1212  }
1213  else if constexpr (std::same_as<T, std::string>)
1214  {
1215  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1216  }
1217  else if constexpr (!std::ranges::range<T>) // char, float
1218  {
1219  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1220  }
1221  else // std::vector of some arithmetic_type type
1222  {
1223  int32_t sz{static_cast<int32_t>(arg.size())};
1224  result.append(reinterpret_cast<char *>(&sz), 4);
1225  result.append(reinterpret_cast<char const *>(arg.data()),
1226  arg.size() * sizeof(std::ranges::range_value_t<T>));
1227  }
1228  };
1229 
1230  for (auto & [tag, variant] : tag_dict)
1231  {
1232  result.push_back(static_cast<char>(tag / 256));
1233  result.push_back(static_cast<char>(tag % 256));
1234 
1235  result.push_back(detail::sam_tag_type_char[variant.index()]);
1236 
1237  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1238  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1239 
1240  std::visit(stream_variant_fn, variant);
1241  }
1242 
1243  return result;
1244 }
1245 
1246 } // namespace seqan3
Provides seqan3::detail::convert_through_char_representation.
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const c) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:159
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:186
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:59
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:39
The alignment base format.
Definition: format_sam_base.hpp:63
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_sam_base.hpp:264
void write_header(stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:710
static void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: format_sam_base.hpp:199
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:219
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:84
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:492
void construct_alignment(align_type &align, std::vector< cigar > &cigar_vector, [[maybe_unused]] int32_t rid, [[maybe_unused]] ref_seqs_type &ref_seqs, [[maybe_unused]] int32_t ref_start, size_t ref_length)
Construct the field::alignment depending on the given information.
Definition: format_sam_base.hpp:304
The actual implementation of seqan3::cigar::operation for documentation purposes only.
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:63
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:290
void read_field(stream_view_type &&stream_view, std::optional< optional_value_type > &target)
Delegate parsing of std::optional types to parsing of the inner value type.
Definition: format_bam.hpp:249
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:610
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
void read_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:233
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1127
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1161
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:159
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:202
void read_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:222
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:925
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:156
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:180
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:80
The alphabet of a gap character '-'.
Definition: gap.hpp:37
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: sam_dna16.hpp:46
Stores the header information of alignment files.
Definition: header.hpp:33
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:158
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:155
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:326
T clear(T... args)
The Concepts library.
Provides various transformation traits used by the range module.
T data(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::debug_stream and related types.
Provides type traits for working with templates.
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:146
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
std::vector< cigar > get_cigar_vector(alignment_type &&alignment, uint32_t const query_start_pos=0, uint32_t const query_end_pos=0, bool const extended_cigar=false)
Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition: cigar.hpp:138
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: cigar.hpp:200
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: misc.hpp:28
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:168
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:150
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:114
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides the seqan3::sam_file_header class.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Provides various utility functions.
Auxiliary functions for the alignment IO.
Provides seqan3::views::istreambuf.
T min(T... args)
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:38
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:36
std::string to_string(value_type &&...values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_dna16.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_input_options.
Provides seqan3::sam_file_output_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:163
uint32_t bin
The bin number.
Definition: format_bam.hpp:169
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:171
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:170
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:173
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:172
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:165
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:166
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:164
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:167
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:175
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:168
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:174
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
T tuple_size_v
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.
T visit(T... args)