Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Add tandem_dup_count to Junction #147

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion include/structures/cluster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ class Cluster
*/
Breakend get_average_mate2() const;

//! \brief Returns either the average tandem_dup_count of the inserted tandem duplications of all cluster members
// with tandem_dup_count != 0, or 0 if most (2/3) of the members have a tandem_dup_count = 0.
size_t get_common_tandem_dup_count() const;

//! \brief Returns the average length of the inserted sequences of all cluster members.
int32_t get_average_inserted_sequence_size() const;
size_t get_average_inserted_sequence_size() const;

//! \brief Returns the members of the cluster.
std::vector<Junction> get_members() const;
Expand All @@ -54,6 +58,7 @@ inline stream_t operator<<(stream_t && stream, Cluster const & clust)
stream << clust.get_average_mate1() << '\t'
<< clust.get_average_mate2() << '\t'
<< clust.get_cluster_size() << '\t'
<< clust.get_common_tandem_dup_count() << '\t'
<< clust.get_average_inserted_sequence_size();
return stream;
}
Expand Down
17 changes: 13 additions & 4 deletions include/structures/junction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class Junction
Breakend mate1{};
Breakend mate2{};
seqan3::dna5_vector inserted_sequence{};
size_t tandem_dup_count{};
std::string read_name{};

public:
Expand All @@ -29,8 +30,10 @@ class Junction
Junction(Breakend the_mate1,
Breakend the_mate2,
auto const & the_inserted_sequence,
size_t the_tandem_dup_count,
std::string the_read_name) : mate1{std::move(the_mate1)},
mate2{std::move(the_mate2)},
tandem_dup_count{the_tandem_dup_count},
read_name{std::move(the_read_name)}
{
if ((mate2.seq_name < mate1.seq_name) ||
Expand Down Expand Up @@ -60,6 +63,9 @@ class Junction
*/
seqan3::dna5_vector get_inserted_sequence() const;

//! \brief Returns the number of tandem copies of this junction.
size_t get_tandem_dup_count() const;

//! \brief Returns the name of the read giving rise to this junction.
std::string get_read_name() const;
};
Expand All @@ -70,27 +76,30 @@ inline constexpr stream_t operator<<(stream_t && stream, Junction const & junc)
stream << junc.get_mate1() << '\t'
<< junc.get_mate2() << '\t'
<< junc.get_inserted_sequence().size() << '\t'
<< junc.get_tandem_dup_count() << '\t'
<< junc.get_read_name();
return stream;
}

/*! \brief A junction is smaller than another, if their first mate, second mate, or inserted sequence (in this order)
* is smaller than the corresponding element of the other junction.
/*! \brief A junction is smaller than another, if their first mate, second mate, count of tandem duplications, or
* inserted sequence (in this order) is smaller than the corresponding element of the other junction.
*
* \param lhs left side junction
* \param rhs right side junction
*/
bool operator<(Junction const & lhs, Junction const & rhs);

/*! \brief A junction is equal to another, if their mates and the inserted sequences are equal to each other.
/*! \brief A junction is equal to another, if their mates, their count of tandem duplications and the inserted sequences
* are equal to each other.
* The read_name is allowed to be unequal, because more than one read could support the same junction.
*
* \param lhs - left side junction
* \param rhs - right side junction
*/
bool operator==(Junction const & lhs, Junction const & rhs);

/*! \brief A junction is unequal to another, if their mates or the inserted sequences are unequal to each other.
/*! \brief A junction is unequal to another, if their mates, their count of tandem duplications or the inserted
* sequences are unequal to each other.
*
* \param lhs - left side junction
* \param rhs - right side junction
Expand Down
3 changes: 2 additions & 1 deletion src/modules/clustering/hierarchical_clustering_method.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,10 @@ int junction_distance(Junction const & lhs, Junction const & rhs)
// Reference: ................
// Junction 1 with mates A and B: A------->B (2bp inserted)
// Junction 2 with mates C and D: C------>D (5bp inserted)
// Distance = 1 (distance A-C) + 2 (distance B-D) + 3 (absolute insertion size difference)
// Distance = 1 (distance A-C) + 2 (distance B-D) + 0 (no tandem duplication) + 3 (absolute insertion size difference)
return (std::abs(lhs.get_mate1().position - rhs.get_mate1().position) +
std::abs(lhs.get_mate2().position - rhs.get_mate2().position) +
std::abs((int)(lhs.get_tandem_dup_count() - rhs.get_tandem_dup_count())) +
std::abs((int)(lhs.get_inserted_sequence().size() - rhs.get_inserted_sequence().size())));
}
else
Expand Down
3 changes: 3 additions & 0 deletions src/modules/sv_detection_methods/analyze_cigar_method.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ void analyze_cigar(std::string const & read_name,
using seqan3::get;
int32_t length = get<0>(pair);
seqan3::cigar::operation operation = get<1>(pair);
size_t tandem_dup_count = 0;
if (operation == 'M'_cigar_operation || operation == '='_cigar_operation || operation == 'X'_cigar_operation)
{
pos_ref += length;
Expand All @@ -40,6 +41,7 @@ void analyze_cigar(std::string const & read_name,
Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
Breakend{chromosome, pos_ref, strand::forward},
inserted_bases,
tandem_dup_count,
read_name};
seqan3::debug_stream << "INS: " << new_junction << "\n";
junctions.push_back(std::move(new_junction));
Expand All @@ -54,6 +56,7 @@ void analyze_cigar(std::string const & read_name,
Junction new_junction{Breakend{chromosome, pos_ref - 1, strand::forward},
Breakend{chromosome, pos_ref + length, strand::forward},
""_dna5,
tandem_dup_count,
read_name};
seqan3::debug_stream << "DEL: " << new_junction << "\n";
junctions.push_back(std::move(new_junction));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segmen
int32_t const min_length,
int32_t const max_overlap)
{
size_t tandem_dup_count = 0;
for (size_t i = 1; i < aligned_segments.size(); i++)
{
AlignedSegment current = aligned_segments[i-1];
Expand Down Expand Up @@ -110,13 +111,13 @@ void analyze_aligned_segments(std::vector<AlignedSegment> const & aligned_segmen
if (distance_on_read < 0)
{
// No inserted sequence between overlapping alignment segments
junctions.emplace_back(mate1, mate2, ""_dna5, read_name);
junctions.emplace_back(mate1, mate2, ""_dna5, tandem_dup_count, read_name);
}
else
{
auto inserted_bases = query_sequence | seqan3::views::slice(current.get_query_end(),
next.get_query_start());
junctions.emplace_back(mate1, mate2, inserted_bases, read_name);
junctions.emplace_back(mate1, mate2, inserted_bases, tandem_dup_count, read_name);
}
seqan3::debug_stream << "BND: " << junctions.back() << "\n";
}
Expand Down
31 changes: 27 additions & 4 deletions src/structures/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,36 @@ Breakend Cluster::get_average_mate2() const
return average_breakend;
}

int32_t Cluster::get_average_inserted_sequence_size() const
size_t Cluster::get_common_tandem_dup_count() const
{
uint64_t sum_sizes = 0;
size_t sum_counts = 0;
size_t amount_zero_counts = 0;
// Iterate through members of the cluster
for (size_t i = 0; i < members.size(); ++i)
{
size_t current_count = members[i].get_tandem_dup_count();
if (current_count == 0)
++amount_zero_counts;
else
sum_counts += members[i].get_tandem_dup_count();
}
// TODO (irallia 12.08.21): This 3 is free choosen and other values could be tested.
// If two thirds of the junctions have a 0 tandem_dup_count, than its probably no tandem duplication.
if (amount_zero_counts > std::round(members.size() / 3.0))
return 0;
else
return std::round(static_cast<double>(sum_counts) / members.size());
}

size_t Cluster::get_average_inserted_sequence_size() const
{
size_t sum_sizes = 0;
// Iterate through members of the cluster
for (size_t i = 0; i < members.size(); ++i)
{
sum_sizes += members[i].get_inserted_sequence().size();
}
int32_t average_size = std::round(static_cast<double>(sum_sizes) / members.size());
size_t average_size = std::round(static_cast<double>(sum_sizes) / members.size());
return average_size;
}

Expand All @@ -93,7 +114,9 @@ bool operator<(Cluster const & lhs, Cluster const & rhs)
? lhs.get_average_mate1() < rhs.get_average_mate1()
: lhs.get_average_mate2() != rhs.get_average_mate2()
? lhs.get_average_mate2() < rhs.get_average_mate2()
: lhs.get_average_inserted_sequence_size() < rhs.get_average_inserted_sequence_size();
: lhs.get_common_tandem_dup_count() != rhs.get_common_tandem_dup_count()
? lhs.get_common_tandem_dup_count() < rhs.get_common_tandem_dup_count()
: lhs.get_average_inserted_sequence_size() < rhs.get_average_inserted_sequence_size();
}

bool operator==(Cluster const & lhs, Cluster const & rhs)
Expand Down
14 changes: 12 additions & 2 deletions src/structures/junction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ seqan3::dna5_vector Junction::get_inserted_sequence() const
return inserted_sequence;
}

size_t Junction::get_tandem_dup_count() const
{
return tandem_dup_count;
}

std::string Junction::get_read_name() const
{
return read_name;
Expand All @@ -26,12 +31,17 @@ bool operator<(Junction const & lhs, Junction const & rhs)
? lhs.get_mate1() < rhs.get_mate1()
: lhs.get_mate2() != rhs.get_mate2()
? lhs.get_mate2() < rhs.get_mate2()
: lhs.get_inserted_sequence() < rhs.get_inserted_sequence();
: lhs.get_tandem_dup_count() != rhs.get_tandem_dup_count()
? lhs.get_tandem_dup_count() < rhs.get_tandem_dup_count()
: lhs.get_inserted_sequence() < rhs.get_inserted_sequence();
}

bool operator==(Junction const & lhs, Junction const & rhs)
{
return (lhs.get_mate1() == rhs.get_mate1()) && (lhs.get_mate2() == rhs.get_mate2()) && (lhs.get_inserted_sequence() == rhs.get_inserted_sequence());
return (lhs.get_mate1() == rhs.get_mate1()) &&
(lhs.get_mate2() == rhs.get_mate2()) &&
(lhs.get_tandem_dup_count() == rhs.get_tandem_dup_count()) &&
(lhs.get_inserted_sequence() == rhs.get_inserted_sequence());
}

bool operator!=(Junction const & lhs, Junction const & rhs)
Expand Down
2 changes: 1 addition & 1 deletion src/variant_detection/variant_output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void find_and_output_variants(std::map<std::string, int32_t> & references_length
{
int32_t mate1_pos = mate1.position;
int32_t mate2_pos = mate2.position;
int32_t insert_size = clusters[i].get_average_inserted_sequence_size();
size_t insert_size = clusters[i].get_average_inserted_sequence_size();
if (mate1.orientation == strand::forward)
{
int32_t distance = mate2_pos - mate1_pos;
Expand Down
Loading