diff --git a/include/iGenVar.hpp b/include/iGenVar.hpp index a3474de7..39000f7c 100644 --- a/include/iGenVar.hpp +++ b/include/iGenVar.hpp @@ -26,6 +26,7 @@ struct cmd_arguments /* -c */ clustering_methods clustering_method{hierarchical_clustering}; // default: hierarchical clustering /* -r */ refinement_methods refinement_method{no_refinement}; // default: no refinement // SV specifications: + /* -e TODO (irallia 19.08.21): duplication_errors */ /* -k */ uint64_t min_var_length = 30; /* -l */ uint64_t max_var_length = 100000; /* -m */ uint64_t max_tol_inserted_length = 50; diff --git a/include/modules/sv_detection_methods/analyze_cigar_method.hpp b/include/modules/sv_detection_methods/analyze_cigar_method.hpp index 7ab08958..79ffb00c 100644 --- a/include/modules/sv_detection_methods/analyze_cigar_method.hpp +++ b/include/modules/sv_detection_methods/analyze_cigar_method.hpp @@ -2,6 +2,96 @@ #include "structures/junction.hpp" // for class Junction +/*! \brief This function checks if the inserted bases are tandem duplicated. + * + * \param[in] config - configuration for a semi-gobal alignment + * \param[in] min_length - minimum length of variants to detect (default 30 bp, + * \param[in] sequence - suffix or prefix sequence + * \param[in] inserted_bases - the inserted bases of the possible duplication + * \param[in] is_suffix - true: suffix case, false: prefix case + * param[out] match_score - if we found a matching duplication, this value represents the maximal + * amount of matches with the reference and thus the length of the existing + * duplicated part on the reference. If there is no duplication, this value + * is 0. + * param[out] length_of_single_dupl_sequence - greatest common divisor of length of inserted sequence and length of + * maching part -> length of a single duplicated sequence + * \returns std::tie(match_score, length_of_single_dupl_sequence) - a tuple of the resulting values + * + * \details In this function the inserted bases are recursively aligned segment by segment until it has been proven that + * it is a real duplication. + * For simplicity, we only consider the suffix case here, since the prefix case works the same way: + * + * suffix_sequence AAAACCGCGTAGCGGGGCGGG + * |||||||||| + * inserted_bases GCGGGGCGGGGCGGG -> unmatched_inserted_bases: GCGGG + * -> match_score = 10 + * -> length_of_single_dupl_sequence = gcd(15, 10) = 5 + * + * suffix_sequence AAAACCGCGTAGCGGGGCGGG + * ||||| + * unmatched_inserted_bases GCGGG + * + * -> match_score = 5, length_of_single_dupl_sequence = gcd(15, 5) = 5 + */ +std::tuple align_suffix_or_prefix(auto const & config, + int32_t const min_length, + std::span & sequence, + std::span & inserted_bases, + bool is_suffix); + +/*! \brief This function checks if the inserted bases are tandem duplicated. + * + * \param[in] query_sequence - SEQ field of the SAM/BAM file + * \param[in] length - length of inserted part, given by the CIGAR + * \param[in] pos_ref - position of the inserted part in the ref (current position) + * \param[in] pos_read - position of the inserted part in the read (current position) + * \param[in] inserted_bases - the inserted bases of the possible duplication + * \param[in] min_length - minimum length of variants to detect (default 30 bp, + * expected to be non-negative) + * \param[in, out] pos_start_dup_seq - start position of the duplicated seq (excluding itself) + * \param[in, out] pos_end_dup_seq - end position of the duplicated seq (including itself) + * \param[in, out] tandem_dup_count - the number of tandem copies of the inserted sequence + * \returns duplicated_bases - the duplicated bases of the duplication + * + * \details If the inserted bases include one or more copies of a duplicated sequence, which is suffix or prefix of + * another copy, we have a Tandem Duplication. To check this, we have to do a semi-global alignment. + * + * Case 1: The duplication (insertion) comes after the matched sequence. Thus we need to check if the inserted + * sequence matches (partly) the suffix sequence. The length of the matching part yields to the amount + * of copies, thus we can calculate the tandem_dup_count and the inserted_bases. + * + * ref AAAACCGCGTAGCGGG----------TACGTAACGGTACG + * |||||||||||||| |||||||| -> inserted sequence: GCGGGGCGGG + * read AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC + * + * suffix_sequence AAAACCGCGTAGCGGG -> free_end_gaps_sequence1_leading{true}, + * ||||| free_end_gaps_sequence1_trailing{false} + * inserted_bases GCGGGGCGGG -> free_end_gaps_sequence2_leading{false}, + * free_end_gaps_sequence2_trailing{true} + * -> tandem_dup_count = 3, duplicated_bases = GCGGG + * + * Case 2: The duplication (insertion) comes before the matched sequence. + * ref AAAACCGCGTA----------GCGGGTACGTAACGGTACG + * ||||||||| ||||||||||||| -> inserted sequence: GCGGGGCGGG + * read AACCGCGTAGCGGGGCGGGGCGGGTACGTAAC + * + * prefix_sequence GCGGGTACGTAACGGTACG -> free_end_gaps_sequence1_leading{false}, + * ||||| free_end_gaps_sequence1_trailing{true} + * inserted_bases GCGGGGCGGG -> free_end_gaps_sequence2_leading{true}, + * free_end_gaps_sequence2_trailing{false} + * -> tandem_dup_count = 3, duplicated_bases = GCGGG + * \see For some complex examples see detection_test.cpp. + */ +std::span detect_tandem_duplication(seqan3::dna5_vector const & query_sequence, + int32_t length, + int32_t pos_ref, + int32_t pos_read, + std::span & inserted_bases, + int32_t const min_length, + int32_t & pos_start_dup_seq, + int32_t & pos_end_dup_seq, + size_t & tandem_dup_count); + /*! \brief This function steps through the CIGAR string and stores junctions with their position in reference and read. * * \param[in] read_name - QNAME field of the SAM/BAM file diff --git a/src/modules/sv_detection_methods/analyze_cigar_method.cpp b/src/modules/sv_detection_methods/analyze_cigar_method.cpp index 368e2384..7d78e99d 100644 --- a/src/modules/sv_detection_methods/analyze_cigar_method.cpp +++ b/src/modules/sv_detection_methods/analyze_cigar_method.cpp @@ -1,3 +1,11 @@ +#include // for std::numeric_limits +#include // for std::gcd (greatest common divisor) + +// #include // for a nicer alignment view +#include +#include +#include +#include #include #include #include @@ -10,6 +18,119 @@ using seqan3::operator""_cigar_operation; using seqan3::operator""_dna5; +std::tuple align_suffix_or_prefix(auto const & config, + int32_t const min_length, + std::span & sequence, + std::span & inserted_bases, + bool is_suffix) +{ + // length of matching part of suffix or prefix sequence + size_t match_score{}; + // The number of tandem copies of this junction. + size_t length_of_single_dupl_sequence = std::numeric_limits::max(); + + auto results = seqan3::align_pairwise(std::tie(sequence, inserted_bases), config); + auto & res = *results.begin(); + // TODO (irallia 17.8.21): The mismatches should give us the opportunity to allow a given amount of errors in the + // duplication. + size_t matches = res.score() % 100; + size_t mismatches = (res.score() - matches) * (-1); + // For the beginning we do not allow mistakes, we should change this later, see todo above. + if (mismatches > 0) + return std::tie(match_score, length_of_single_dupl_sequence); + // found duplicated sequence in front of inserted sequence + if (matches >= min_length) + { + // seqan3::debug_stream << "Resulting alignment:\n" << res.alignment() << '\n'; + match_score = matches; + // The possible length of the single duplicated: greatest common divisor of length of inserted part and length + // of maching part + length_of_single_dupl_sequence = std::gcd(inserted_bases.size(), matches); + if(matches != inserted_bases.size()) + { + std::span unmatched_inserted_bases{}; + if (is_suffix) + unmatched_inserted_bases = inserted_bases | seqan3::views::slice(match_score, inserted_bases.size()); + else + unmatched_inserted_bases = inserted_bases | seqan3::views::slice(0, inserted_bases.size() - match_score); + // The first length_of_single_dupl_sequence is already the greatest common divisor and therefore the next + // recursively calculated one can be ignored. + auto [ next_match_score, + next_length_of_single_dupl_sequence ] = align_suffix_or_prefix(config, + min_length, + sequence, + unmatched_inserted_bases, + is_suffix); + // If the substring does not match, there is no real duplication. + if (next_match_score == 0) { + length_of_single_dupl_sequence = std::numeric_limits::max(); + return std::tie(next_match_score, length_of_single_dupl_sequence); + } + } + } + // The first match_score calculated is automatically the maximum, so the recursively next one can be ignored. + return std::tie(match_score, length_of_single_dupl_sequence); +} + +std::span detect_tandem_duplication(seqan3::dna5_vector const & query_sequence, + int32_t length, + int32_t pos_ref, + int32_t pos_read, + std::span & inserted_bases, + int32_t const min_length, + int32_t & pos_start_dup_seq, + int32_t & pos_end_dup_seq, + size_t & tandem_dup_count) +{ + auto scoring_scheme = seqan3::align_cfg::scoring_scheme{ + seqan3::nucleotide_scoring_scheme{seqan3::match_score{1}, + seqan3::mismatch_score{-100}}}; + + auto gap_scheme = seqan3::align_cfg::gap_cost_affine{seqan3::align_cfg::open_score{0}, + seqan3::align_cfg::extension_score{-100}}; + // Suffix Case: + auto suffix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{true}, + seqan3::align_cfg::free_end_gaps_sequence2_leading{false}, + seqan3::align_cfg::free_end_gaps_sequence1_trailing{false}, + seqan3::align_cfg::free_end_gaps_sequence2_trailing{true}} | + scoring_scheme | gap_scheme; + + std::span suffix_sequence = query_sequence | seqan3::views::slice(0, pos_read); + auto [ suffix_sequence_match_score, length_of_single_dupl_sequence_1 ] = align_suffix_or_prefix(suffix_config, + min_length, + suffix_sequence, + inserted_bases, + true); + // Prefix Case: + auto prefix_config = seqan3::align_cfg::method_global{seqan3::align_cfg::free_end_gaps_sequence1_leading{false}, + seqan3::align_cfg::free_end_gaps_sequence2_leading{true}, + seqan3::align_cfg::free_end_gaps_sequence1_trailing{true}, + seqan3::align_cfg::free_end_gaps_sequence2_trailing{false}} | + scoring_scheme | gap_scheme; + + std::span prefix_sequence = query_sequence | seqan3::views::slice(pos_read + length, + query_sequence.size()); + auto [ prefix_sequence_match_score, length_of_single_dupl_sequence_2 ] = align_suffix_or_prefix(prefix_config, + min_length, + prefix_sequence, + inserted_bases, + false); + + pos_start_dup_seq = pos_ref - 1 - suffix_sequence_match_score; + pos_end_dup_seq = pos_ref - 1 + prefix_sequence_match_score; + + int16_t length_of_single_dupl_sequence = std::min(length_of_single_dupl_sequence_1, + length_of_single_dupl_sequence_2); + tandem_dup_count = suffix_sequence_match_score / length_of_single_dupl_sequence // duplicated part on ref + + inserted_bases.size() / length_of_single_dupl_sequence // inserted duplications + + prefix_sequence_match_score / length_of_single_dupl_sequence; // duplicated part on ref + // If its a duplication instead of an insertion, we save the (possible multiple times) duplicated part as duplicated_bases + std::span duplicated_bases{}; + if (tandem_dup_count > 0) + duplicated_bases = (inserted_bases | seqan3::views::slice(0, length_of_single_dupl_sequence)); + return duplicated_bases; +} + void analyze_cigar(std::string const & read_name, std::string const & chromosome, int32_t const query_start_pos, @@ -27,30 +148,71 @@ void analyze_cigar(std::string const & read_name, using seqan3::get; int32_t length = get<0>(pair); seqan3::cigar::operation operation = get<1>(pair); - size_t tandem_dup_count = 0; if (operation == 'M'_cigar_operation || operation == '='_cigar_operation || operation == 'X'_cigar_operation) { pos_ref += length; pos_read += length; } + // ------------------------------------- else if (operation == 'I'_cigar_operation) // I: Insertion (gap in the reference sequence) { if (length >= min_length) { // Insertions cause one junction from the insertion location to the next base - auto inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); - Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, - Breakend{chromosome, pos_ref, strand::forward}, - inserted_bases, - tandem_dup_count, - read_name}; - if (gVerbose) - seqan3::debug_stream << "INS: " << new_junction << "\n"; - junctions.push_back(std::move(new_junction)); + std::span inserted_bases = query_sequence | seqan3::views::slice(pos_read, + pos_read + length); + // ---------------- DUP ---------------- + + // Case - Duplication: The inserted bases include one or more copies of a duplicated sequence, with an + // origin somewhere else. -> global alignment + // ##ALT= + // TODO (23.7.21, irallia): This is Part of the Epic #144. Do we need a reference sequence for this? + // ---------------- DUP:TANDEM ---------------- + size_t tandem_dup_count = 0; + int32_t pos_start_dup_seq{}; + int32_t pos_end_dup_seq{}; + std::span duplicated_bases = detect_tandem_duplication(query_sequence, + length, + pos_ref, + pos_read, + inserted_bases, + min_length, + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + if (tandem_dup_count != 0) + { + Junction new_junction{Breakend{chromosome, pos_start_dup_seq, strand::forward}, + Breakend{chromosome, pos_end_dup_seq, strand::forward}, + duplicated_bases, + tandem_dup_count, + read_name}; + if (gVerbose) + { + seqan3::debug_stream << "DUP:TANDEM: " << new_junction << "\n"; + seqan3::debug_stream << "\t\t\tduplicated sequence: " << duplicated_bases + << " with " << tandem_dup_count << " duplications\n"; + } + junctions.push_back(std::move(new_junction)); + } else { + // ---------------- INS ---------------- + Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, + Breakend{chromosome, pos_ref, strand::forward}, + inserted_bases, + 0, + read_name}; + if (gVerbose) + { + seqan3::debug_stream << "INS: " << new_junction << "\n"; + seqan3::debug_stream << "\t\t\tinserted sequence: " << inserted_bases << "\n"; + } + junctions.push_back(std::move(new_junction)); + } } pos_read += length; } - else if (operation == 'D'_cigar_operation) + // ---------------- DEL ---------------- + else if (operation == 'D'_cigar_operation) // D: Deletion (gap in the read) { if (length >= min_length) { @@ -58,7 +220,7 @@ void analyze_cigar(std::string const & read_name, Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward}, Breakend{chromosome, pos_ref + length, strand::forward}, ""_dna5, - tandem_dup_count, + 0, read_name}; if (gVerbose) seqan3::debug_stream << "DEL: " << new_junction << "\n"; diff --git a/src/variant_detection/variant_output.cpp b/src/variant_detection/variant_output.cpp index 4d9ae2ae..0dbf5d31 100644 --- a/src/variant_detection/variant_output.cpp +++ b/src/variant_detection/variant_output.cpp @@ -32,6 +32,10 @@ void find_and_output_variants(std::map & references_length { int distance = mate2.position - mate1.position - 1; int sv_length = insert_size - distance; + // Distinguish SVLEN calculation for TANDEM:DUP + if (clusters[i].get_common_tandem_dup_count() > 0) + sv_length = distance + 1; + // The SVLEN is neither too short nor too long than specified by the user. if (std::abs(sv_length) >= args.min_var_length && std::abs(sv_length) <= args.max_var_length) @@ -46,7 +50,7 @@ void find_and_output_variants(std::map & references_length tmp.add_info("SVTYPE", "DUP"); // Increment position by 1 because VCF is 1-based tmp.set_pos(mate1.position + 1); - tmp.add_info("SVLEN", std::to_string(distance)); + tmp.add_info("SVLEN", std::to_string(sv_length)); // Increment end by 1 because VCF is 1-based tmp.add_info("END", std::to_string(mate2.position + 1)); tmp.print(out_stream); diff --git a/test/api/detection_test.cpp b/test/api/detection_test.cpp index 459c76e6..4a1b612f 100644 --- a/test/api/detection_test.cpp +++ b/test/api/detection_test.cpp @@ -90,6 +90,155 @@ TEST(junction_detection, cigar_string_del_padding) } } +TEST(junction_detection, cigar_string_dup_detect_tandem_duplication) +{ + seqan3::dna5_vector const & query_sequence = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC"_dna5}; + int32_t length = 15; + int32_t pos_start_dup_seq = 0; + int32_t pos_end_dup_seq = 0; + size_t tandem_dup_count = 0; + + // Prefix Case: The duplication (insertion) comes after the matched sequence. + // (10, 20] -> pos_start_dup_seq, pos_end_dup_seq + // ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG + // ||||||||||||||||||| |||||||| + // read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGGGCGGG + // + // suffix_sequence AAAACCGCGTAGCGGGGCGGG + // |||||||||| + // inserted sequence: GCGGGGCGGGGCGGG + // -> tandem_dup_count = 5, inserted_bases = GCGGG + int32_t pos_ref = 21; + int32_t pos_read = 19; + std::span inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGG"_dna5 + + auto duplicated_bases = detect_tandem_duplication(query_sequence, + length, + pos_ref, + pos_read, + inserted_bases, + 4, // minimal length of duplication + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + EXPECT_EQ(tandem_dup_count, 5); + EXPECT_EQ(pos_start_dup_seq, 10); + EXPECT_EQ(pos_end_dup_seq, 20); + + // Suffix Case: The duplication (insertion) comes before the matched sequence. + // (10, 20] -> pos_start_dup_seq, pos_end_dup_seq + // ref AAAACCGCGTA---------------GCGGGGCGGGTACGTAACGGTACG + // ||||||||| |||||||||||||||||| + // read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGGGCGGG + // + // prefix_sequence GCGGGGCGGGTACGTAACGGTACG + // |||||||||| + // inserted sequence: GCGGGGCGGGGCGGG + // -> tandem_dup_count = 5, inserted_bases = GCGGG + pos_ref = 11; // position of first inserted base. + pos_read = 9; + inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); + duplicated_bases = detect_tandem_duplication(query_sequence, + length, + pos_ref, + pos_read, + inserted_bases, + 4, // minimal length of duplication + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + EXPECT_EQ(tandem_dup_count, 5); + EXPECT_EQ(pos_start_dup_seq, 10); + EXPECT_EQ(pos_end_dup_seq, 20); + + // Special Case: The duplication (insertion) is in between of matching sequences. + // (10, 25] -> pos_start_dup_seq, pos_end_dup_seq + // ref AAAACCGCGTAGCGGG----------GCGGGGCGGGTACGTAACGGTACG + // |||||||||||||| ||||||||||||| + // read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGGGGCGGG + // + // suffix_sequence AAAACCGCGTAGCGGG prefix_sequence GCGGGGCGGGTACGTAACGGTACG + // ||||| |||||||||| + // inserted sequence: GCGGGGCGGG GCGGGGCGGG + // -> tandem_dup_count = 5, inserted_bases = GCGGG + length = 10; + pos_ref = 16; + pos_read = 14; + inserted_bases = query_sequence | seqan3::views::slice(pos_read, pos_read + length); // "GCGGG"_dna5 + + duplicated_bases = detect_tandem_duplication(query_sequence, + length, + pos_ref, + pos_read, + inserted_bases, + 4, // minimal length of duplication + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + EXPECT_EQ(tandem_dup_count, 5); + EXPECT_EQ(pos_start_dup_seq, 10); + EXPECT_EQ(pos_end_dup_seq, 25); + + // Case with errors at the end: The duplication (insertion) includes some errors at the end. + // (10, 20] -> pos_start_dup_seq, pos_end_dup_seq + // ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG + // ||||||||||||||||||| |||||||| + // read AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC -> inserted sequence: GCGGGGCGGGGCAAA + // + // suffix_sequence AAAACCGCGTAGCGGGGCGGG + // |||||||||| + // inserted sequence: GCGGGGCGGGGCAAA + // + // suffix_sequence AAAACCGCGTAGCGGGGCGGG + // ||--- -> subsequence does not match without mismatches + // inserted sequence: GCAAA + // + seqan3::dna5_vector const & query_sequence_2 = {"AACCGCGTAGCGGGGCGGGGCGGGGCGGGGCAAATACGTAAC"_dna5}; + length = 15; + pos_ref = 21; + pos_read = 19; + inserted_bases = query_sequence_2 | seqan3::views::slice(pos_read, pos_read + length); // "GCGGGGCGGGGCAAA"_dna5 + + duplicated_bases = detect_tandem_duplication(query_sequence_2, + length, + pos_ref, + pos_read, + inserted_bases, + 4, // minimal length of duplication + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + EXPECT_EQ(tandem_dup_count, 0); + EXPECT_EQ(pos_start_dup_seq, 20); + EXPECT_EQ(pos_end_dup_seq, 20); + + // Case with a single error: The duplication (insertion) includes a single errors. + // (10, 20] -> pos_start_dup_seq, pos_end_dup_seq + // ref AAAACCGCGTAGCGGGGCGGG---------------TACGTAACGGTACG + // ||||||||||||||||||| |||||||| + // read AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC -> inserted sequence: GCGAGGCGGGGCGGG + // + // suffix_sequence AAAACCGCGTAGCGGGGCGGG + // |||-|||||| -> single mismatch in the alignment + // inserted sequence: GCGAGGCGGGGCGGG + // + seqan3::dna5_vector const & query_sequence_3 = {"AACCGCGTAGCGGGGCGGGGCGAGGCGGGGCGGGTACGTAAC"_dna5}; + inserted_bases = query_sequence_3 | seqan3::views::slice(pos_read, pos_read + length); // "GCGAGGCGGGGCGGG"_dna5 + + duplicated_bases = detect_tandem_duplication(query_sequence_3, + length, + pos_ref, + pos_read, + inserted_bases, + 4, // minimal length of duplication + pos_start_dup_seq, + pos_end_dup_seq, + tandem_dup_count); + EXPECT_EQ(tandem_dup_count, 0); + EXPECT_EQ(pos_start_dup_seq, 20); + EXPECT_EQ(pos_end_dup_seq, 20); +} + TEST(junction_detection, cigar_string_simple_ins) { std::string const read_name = "read021"; diff --git a/test/api/structures_test.cpp b/test/api/structures_test.cpp index 243e6b85..3ce22982 100644 --- a/test/api/structures_test.cpp +++ b/test/api/structures_test.cpp @@ -2,11 +2,14 @@ #include "structures/aligned_segment.hpp" #include "structures/breakend.hpp" +#include "structures/cluster.hpp" +#include "structures/junction.hpp" #include "variant_detection/method_enums.hpp" /* tests for aligned_segment */ using seqan3::operator""_cigar_operation; +using seqan3::operator""_dna5; TEST(structures, aligned_segment) { @@ -104,7 +107,7 @@ TEST(structures, method_enums_refinement_methods) /* tests for junctions */ -TEST(structures, breakend_flip_orientation) +TEST(structures, junctions_breakend_flip_orientation) { Breakend forward_breakend{"chr1", 42, strand::forward}; Breakend reverse_breakend{"chr1", 42, strand::reverse}; @@ -117,3 +120,41 @@ TEST(structures, breakend_flip_orientation) forward_breakend.flip_orientation(); EXPECT_EQ(forward_breakend, reverse_breakend); // both are forward now } + +/* tests for clusters */ + +TEST(structures, clusters_get_common_tandem_dup_count) +{ + Junction junction_1{Breakend{"chr1", 123456, strand::forward}, + Breakend{"chr1", 234567, strand::forward}, + ""_dna5, 0, "read_1"}; + Junction junction_2{Breakend{"chr1", 123456, strand::forward}, + Breakend{"chr1", 234567, strand::forward}, + ""_dna5, 1, "read_1"}; + Junction junction_3{Breakend{"chr1", 123456, strand::forward}, + Breakend{"chr1", 234567, strand::forward}, + ""_dna5, 2, "read_1"}; + Junction junction_4{Breakend{"chr1", 123456, strand::forward}, + Breakend{"chr1", 234567, strand::forward}, + ""_dna5, 7, "read_1"}; + + // Tandem duplication with 2 copies: + Cluster cluster_1{{junction_3, junction_3, junction_3}}; + EXPECT_EQ(cluster_1.get_common_tandem_dup_count(), 2); + + // Tandem duplication with different amount of copies: + Cluster cluster_2{{junction_2, junction_3, junction_4}}; + EXPECT_EQ(cluster_2.get_common_tandem_dup_count(), 3); // (1+2+7)/3=3,333 + + // More than two thirds of the junctions have a 0 tandem_dup_count, than its probably no tandem duplication. + Cluster cluster_3{{junction_1, junction_1, junction_1, junction_2}}; + EXPECT_EQ(cluster_3.get_common_tandem_dup_count(), 0); + + // Two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication. + Cluster cluster_4{{junction_1, junction_1, junction_2}}; + EXPECT_EQ(cluster_4.get_common_tandem_dup_count(), 0); + + // Less than two thirds of the junctions have a 0 tandem_dup_count, than its probably a tandem duplication. + Cluster cluster_5{{junction_1, junction_2}}; + EXPECT_EQ(cluster_5.get_common_tandem_dup_count(), 1); +} diff --git a/test/cli/iGenVar_cli_test.cpp b/test/cli/iGenVar_cli_test.cpp index 6f29309a..371f88ca 100644 --- a/test/cli/iGenVar_cli_test.cpp +++ b/test/cli/iGenVar_cli_test.cpp @@ -197,6 +197,7 @@ TEST_F(iGenVar_cli_test, test_verbose_option) { "Detect junctions in long reads...\n" "INS: chr21\t41972615\tForward\tchr21\t41972616\tForward\t1681\t0\tm2257/8161/CCS\n" + "\t\t\tinserted sequence: GAGTGGACCTCAGCAAACTCCCAGTAGAGCTGCAGCAGAGGGGGTCTGACTGTTAGGATGGAAAACTAACAAACAGAAGGCAATAGCATCAACAACAACAAAAAAAAACTGCCACACAAAAACCGCTATCCGAAGATCACCAACATCAAAGATCGAACAGGTAGACAAATGACGAAGAGAGGAAAAAACAGTGCAAAAAAGGCTGAAAAGCCCAGAACCACCTCTTCCCTCCAGAGGATCACAACTCCTCACCAGGAAAGGGAACAAAACTGCACAGAGAAGAGTTTGACCAATGACAGAAGTAGGCTTCAGCAGAATGGGTAATAACTCCTCTGAGCTAAAGGAGCATGTTCCTACCCTAATGCAAGGAAGCTAAAGATACTTGATAAAAGGTTACAGGAACTGCTAACTAAATAAACCAGTTCAGAGAAGAACATAAATGACCTAAATGGAGCTGAAAAACACAGCATGAGAACTTCATGAAGGAATACACAAGGTATCAACAGACAAGTCCATCAGGCAGAAGAAAGGGATATCAGAGATTGAAGATCAACTTAATGAAATAAAGCATGCAAGACGAGATTAGTGAGAAAAAAGAATTAAAAGAAATGAGCAAAGGCCTCAAGGAAATATGGGACTATGTGTAAAAGACCAAGCATACGGTTTGATTGGTGTATGTGAAAATGACAGGGAAAATGGAACCAAGTTGGAAAACACTCTTCAGGATATCATGCAGGAGAACCTCCCAACCTAGCAAGAGAAGCCAACATTCACATTCAGGAAATACAAGAGAACACCACCAAGATACTCCTTGAGAAGTAGCAAACCCCCAAGACACATAATTGTTCAGATTCAGGCAAGGGTGAAAATGAAGGAAAAAATGCTAAGAGAGCCAGAGAGAAAGGTATGGGTTATCCACAAAAGGGCCAGCCATCAGACTAAGAGCAATTCTCTGCAGAAACCCTACAACCAGAAGAGAGAAGGGGCCAATATTCAACATTCTTAAAGAAAAGAATTTTCAACCCAGAATTTCATATCCAGCCAAAACAAAGCTTCGTAAGTGAAGGAGAAATAAATTTCTTTACAGACAAGCAAATGACTGAAGAAGATTTTTGTCACCACCATGCCTGCCTTACAAGATCTCCTGAAGGAAGCACTAAGACATGGGAAGGAAAAAATCCAGTACCAAGCCACTGCTAAACCATACCAAAATGTAGAGACTCAATGCTTAGGATAGGAAACTGCATCAACTAGCAGTCAAAATAACCAGCTAGCATTCATAATGACAGGATCAAATTCAGACCACATACAATTATTAACCTTAAATGTAAATGGGCTAAATGCCGCAATTAAAAGACACATCACTGGCAAATTGGATAAAGAGTCAAGCCCAATCGGTGTGCTGTTATTCAAGGAGACACCACTCTCACGTGCAAGAGACACAGATAGGCTCGAAAATGAATAGGGATGAAGGAAGATTACCAAGCAAATGGAAAGCAAAAAAAAAAAAAGCAGGGGTTGCAAATCCTAGTCTCTGTAAAACACTACTTTAAACCAAGAAAGATCAAAAGAGACAAAGAGGGCTTTAATAATGGTAATAGGGGGATTAATTCAACAAGAAGAGTTAACTATCCTAAATATATATGCTGCCTAATACAGGCACACCCAGATTCATAAAGCA\n" "The read depth method for long reads is not yet implemented.\n" "BND: chr21\t41972615\tReverse\tchr22\t17458415\tReverse\t0\t0\tm41327/11677/CCS\n" "The read depth method for long reads is not yet implemented.\n" diff --git a/test/data/datasources.cmake b/test/data/datasources.cmake index 340e0718..a26ccb6c 100644 --- a/test/data/datasources.cmake +++ b/test/data/datasources.cmake @@ -29,19 +29,19 @@ declare_datasource (FILE single_end_mini_example.sam # copies file to /data/output_err.txt declare_datasource (FILE output_err.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_err.txt - URL_HASH SHA256=606826366c63cf8ed09c0efddbc6a010dd3f9e670946f690cb1bc54d246e7fcd) + URL_HASH SHA256=b4c832bbf50cf3b9893191caa4dcc811299ec9de0270f2470bbb22a509826d6a) # copies file to /data/output_res.txt declare_datasource (FILE output_res.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_res.txt - URL_HASH SHA256=71238d79330c9efcf61467c20cd998517bf755d64742f60cbbfe0a5399a6c063) + URL_HASH SHA256=6ce03518859776b951b9246c0bdf62537484cdb120de752468597f21524523de) # copies file to /data/output_short_and_long_err.txt declare_datasource (FILE output_short_and_long_err.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_short_and_long_err.txt - URL_HASH SHA256=4edfef05a26c876d51fa9ecabab0ef03ce52c6e71206eb96dd5e1751a852439c) + URL_HASH SHA256=8dcede8d8b733e1ac9eb2b327c5d14d890d0c6f4e77c95ec8bf9e0d10a593a2e) # copies file to /data/output_short_and_long_res.txt declare_datasource (FILE output_short_and_long_res.txt URL ${CMAKE_SOURCE_DIR}/test/data/mini_example/output_short_and_long_res.txt - URL_HASH SHA256=55262c77c479cd4d767a43a03d4afd1528d722097de8a81d9e52805b37671e01) + URL_HASH SHA256=d17b7b163bdbbb2c36fab6ca8a6282645e1ceaf240f66c183c56ccb307d59b90) diff --git a/test/data/mini_example/output_err.txt b/test/data/mini_example/output_err.txt index 197e86d8..df940637 100644 --- a/test/data/mini_example/output_err.txt +++ b/test/data/mini_example/output_err.txt @@ -11,14 +11,18 @@ DEL: chr1 56 Forward chr1 70 Forward 0 0 read018 DUP:TANDEM: chr1 109 Forward chr1 124 Forward 0 2 read021 2 segments describe this tandem duplication. Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2 INS: chr1 124 Forward chr1 125 Forward 15 0 read023 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read024 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read025 + inserted sequence: CCCCGGGGCCAATTT BND: chr1 96 Forward chr1 125 Forward 0 0 read027 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read029 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read030 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 -INS: chr1 179 Forward chr1 180 Forward 8 0 read031 +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read031 + duplicated sequence: ATATATTT with 2 duplications DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read033 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 BND: chr1 180 Reverse chr1 187 Reverse 0 0 read034 @@ -38,10 +42,15 @@ DEL: chr1 335 Forward chr1 350 Forward 0 0 read043 DEL: chr1 335 Forward chr1 350 Forward 0 0 read044 DEL: chr1 335 Forward chr1 350 Forward 0 0 read045 INS: chr1 367 Forward chr1 368 Forward 11 0 read046 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read047 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read048 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read049 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read050 + inserted sequence: GGTAACGTGTA DEL: chr1 383 Forward chr1 395 Forward 0 0 read050 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read051 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read052 diff --git a/test/data/mini_example/output_res.txt b/test/data/mini_example/output_res.txt index 86570a17..f75bff9e 100644 --- a/test/data/mini_example/output_res.txt +++ b/test/data/mini_example/output_res.txt @@ -12,12 +12,12 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT MYSAMPLE chr1 57 . N 9 PASS END=70;SVLEN=-13;SVTYPE=DEL GT ./. chr1 97 . N 1 PASS END=125;SVLEN=-28;SVTYPE=DEL GT ./. -chr1 110 . N 1 PASS END=125;SVLEN=14;SVTYPE=DUP GT ./. +chr1 110 . N 1 PASS END=125;SVLEN=15;SVTYPE=DUP GT ./. chr1 125 . N 3 PASS END=125;SVLEN=15;SVTYPE=INS GT ./. -chr1 180 . N 1 PASS END=180;SVLEN=8;SVTYPE=INS GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. chr1 266 . N 4 PASS END=286;SVLEN=-20;SVTYPE=DEL GT ./. chr1 282 . N 1 PASS END=299;SVLEN=-17;SVTYPE=DEL GT ./. chr1 336 . N 4 PASS END=350;SVLEN=-14;SVTYPE=DEL GT ./. chr1 368 . N 5 PASS END=368;SVLEN=11;SVTYPE=INS GT ./. chr1 384 . N 1 PASS END=395;SVLEN=-11;SVTYPE=DEL GT ./. -chr1 512 . N 1 PASS END=529;SVLEN=16;SVTYPE=DUP GT ./. +chr1 512 . N 1 PASS END=529;SVLEN=17;SVTYPE=DUP GT ./. diff --git a/test/data/mini_example/output_short_and_long_err.txt b/test/data/mini_example/output_short_and_long_err.txt index 36900cef..9d6a5ef0 100644 --- a/test/data/mini_example/output_short_and_long_err.txt +++ b/test/data/mini_example/output_short_and_long_err.txt @@ -9,22 +9,39 @@ DEL: chr1 56 Forward chr1 70 Forward 0 0 read016 DEL: chr1 56 Forward chr1 70 Forward 0 0 read017 DEL: chr1 56 Forward chr1 70 Forward 0 0 read018 INS: chr1 124 Forward chr1 125 Forward 15 0 read001 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read002 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read003 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read004 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read005 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read006 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read007 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read008 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read009 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read010 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read011 -INS: chr1 179 Forward chr1 180 Forward 8 0 read013 -INS: chr1 179 Forward chr1 180 Forward 8 0 read014 -INS: chr1 179 Forward chr1 180 Forward 8 0 read015 -INS: chr1 179 Forward chr1 180 Forward 8 0 read016 -INS: chr1 179 Forward chr1 180 Forward 8 0 read017 -INS: chr1 179 Forward chr1 180 Forward 8 0 read018 + inserted sequence: CCCCGGGGCCAATTT +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read013 + duplicated sequence: ATATATTT with 2 duplications +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read014 + duplicated sequence: ATATATTT with 2 duplications +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read015 + duplicated sequence: ATATATTT with 2 duplications +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read016 + duplicated sequence: ATATATTT with 2 duplications +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read017 + duplicated sequence: ATATATTT with 2 duplications +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read018 + duplicated sequence: ATATATTT with 2 duplications Detect junctions in long reads... DEL: chr1 56 Forward chr1 70 Forward 0 0 read010 DEL: chr1 56 Forward chr1 70 Forward 0 0 read011 @@ -38,14 +55,18 @@ DEL: chr1 56 Forward chr1 70 Forward 0 0 read018 DUP:TANDEM: chr1 109 Forward chr1 124 Forward 0 2 read021 2 segments describe this tandem duplication. Its length on the read is 34 and a single duplicated part has a length of 16 => tandem_dup_count = 2 INS: chr1 124 Forward chr1 125 Forward 15 0 read023 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read024 + inserted sequence: CCCCGGGGCCAATTT INS: chr1 124 Forward chr1 125 Forward 15 0 read025 + inserted sequence: CCCCGGGGCCAATTT BND: chr1 96 Forward chr1 125 Forward 0 0 read027 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read029 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read030 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 -INS: chr1 179 Forward chr1 180 Forward 8 0 read031 +DUP:TANDEM: chr1 179 Forward chr1 187 Forward 8 2 read031 + duplicated sequence: ATATATTT with 2 duplications DUP:TANDEM: chr1 180 Forward chr1 187 Forward 0 2 read033 2 segments describe this tandem duplication. Its length on the read is 16 and a single duplicated part has a length of 8 => tandem_dup_count = 2 BND: chr1 180 Reverse chr1 187 Reverse 0 0 read034 @@ -65,10 +86,15 @@ DEL: chr1 335 Forward chr1 350 Forward 0 0 read043 DEL: chr1 335 Forward chr1 350 Forward 0 0 read044 DEL: chr1 335 Forward chr1 350 Forward 0 0 read045 INS: chr1 367 Forward chr1 368 Forward 11 0 read046 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read047 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read048 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read049 + inserted sequence: GGTAACGTGTA INS: chr1 367 Forward chr1 368 Forward 11 0 read050 + inserted sequence: GGTAACGTGTA DEL: chr1 383 Forward chr1 395 Forward 0 0 read050 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read051 BND: chr1 10 Reverse chr1 470 Reverse 0 0 read052 @@ -77,5 +103,5 @@ DUP:TANDEM: chr1 511 Forward chr1 528 Forward 0 2 read053 2 segments describe this tandem duplication. Its length on the read is 39 and a single duplicated part has a length of 18 => tandem_dup_count = 2 INS: chr1 508 Reverse chr1 528 Reverse 6 0 read054 Start clustering... -Done with clustering. Found 18 junction clusters. +Done with clustering. Found 24 junction clusters. No refinement was selected. diff --git a/test/data/mini_example/output_short_and_long_res.txt b/test/data/mini_example/output_short_and_long_res.txt index 315d7459..0878792d 100644 --- a/test/data/mini_example/output_short_and_long_res.txt +++ b/test/data/mini_example/output_short_and_long_res.txt @@ -12,12 +12,18 @@ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT MYSAMPLE chr1 57 . N 18 PASS END=70;SVLEN=-13;SVTYPE=DEL GT ./. chr1 97 . N 1 PASS END=125;SVLEN=-28;SVTYPE=DEL GT ./. -chr1 110 . N 1 PASS END=125;SVLEN=14;SVTYPE=DUP GT ./. +chr1 110 . N 1 PASS END=125;SVLEN=15;SVTYPE=DUP GT ./. chr1 125 . N 14 PASS END=125;SVLEN=15;SVTYPE=INS GT ./. -chr1 180 . N 7 PASS END=180;SVLEN=8;SVTYPE=INS GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. +chr1 180 . N 1 PASS END=188;SVLEN=8;SVTYPE=DUP GT ./. chr1 266 . N 4 PASS END=286;SVLEN=-20;SVTYPE=DEL GT ./. chr1 282 . N 1 PASS END=299;SVLEN=-17;SVTYPE=DEL GT ./. chr1 336 . N 4 PASS END=350;SVLEN=-14;SVTYPE=DEL GT ./. chr1 368 . N 5 PASS END=368;SVLEN=11;SVTYPE=INS GT ./. chr1 384 . N 1 PASS END=395;SVLEN=-11;SVTYPE=DEL GT ./. -chr1 512 . N 1 PASS END=529;SVLEN=16;SVTYPE=DUP GT ./. +chr1 512 . N 1 PASS END=529;SVLEN=17;SVTYPE=DUP GT ./.